Merge pull request #26618 from trevor-m:tmorris_tftrt_dont_rebuild_failed_engines

PiperOrigin-RevId: 238143344
diff --git a/configure.py b/configure.py
index 3dc0cb2..08684e8 100644
--- a/configure.py
+++ b/configure.py
@@ -1305,11 +1305,14 @@
         all_valid = False
       else:
         ver = float(m.group(0))
-        if ver < 3.5:
-          print('ERROR: TensorFlow only supports CUDA compute capabilities 3.5 '
+        if ver < 3.0:
+          print('ERROR: TensorFlow only supports CUDA compute capabilities 3.0 '
                 'and higher. Please re-specify the list of compute '
                 'capabilities excluding version %s.' % ver)
           all_valid = False
+        if ver < 3.5:
+          print('WARNING: XLA does not support CUDA compute capabilities '
+                'lower than 3.5. Disable XLA when running on older GPUs.')
 
     if all_valid:
       break
@@ -1611,7 +1614,7 @@
   # environment variables.
   environ_cp = dict(os.environ)
 
-  check_bazel_version('0.19.0', '0.23.1')
+  check_bazel_version('0.19.0', '0.23.2')
 
   reset_tf_configure_bazelrc()
 
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 3f46f71..ba13e78 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -198,6 +198,7 @@
         "//tensorflow/core/kernels:control_flow_ops",
         "//tensorflow/core/kernels:fifo_queue",
         "//tensorflow/core/kernels:function_ops",
+        "//tensorflow/core/kernels:host_constant_op",
         "//tensorflow/core/kernels:identity_n_op",
         "//tensorflow/core/kernels:identity_op",
         "//tensorflow/core/kernels:no_op",
diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index 09e04d2..48bd2f1 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -29,6 +29,7 @@
 #include "tensorflow/core/kernels/data/prefetch_dataset_op.h"
 #include "tensorflow/core/kernels/fifo_queue.h"
 #include "tensorflow/core/kernels/function_ops.h"
+#include "tensorflow/core/kernels/host_constant_op.h"
 #include "tensorflow/core/kernels/identity_n_op.h"
 #include "tensorflow/core/kernels/identity_op.h"
 #include "tensorflow/core/kernels/no_op.h"
@@ -94,6 +95,8 @@
       Name("Const").Device(DEVICE).TypeConstraint("dtype", TYPES),             \
       ConstantOp);                                                             \
   REGISTER_KERNEL_BUILDER(                                                     \
+      Name("HostConst").Device(DEVICE).HostMemory("output"), _HostConstantOp); \
+  REGISTER_KERNEL_BUILDER(                                                     \
       Name("Identity").Device(DEVICE).TypeConstraint("T", TYPES), IdentityOp); \
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("Identity").Device(DEVICE).TypeConstraint("T", DT_STRING),          \
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 0a5512c..3db98f7 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -892,7 +892,7 @@
 
 tf_xla_py_test(
     name = "tensor_array_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["tensor_array_ops_test.py"],
     # TensorArray ops are not implemented in the on-demand compilation model yet.
     disabled_backends = ["cpu_ondemand"],
diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD
index c4d3676..a7fb772 100644
--- a/tensorflow/compiler/tf2tensorrt/BUILD
+++ b/tensorflow/compiler/tf2tensorrt/BUILD
@@ -11,6 +11,7 @@
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "tf_cc_shared_object",
     "tf_cc_test",
     "tf_copts",
     "tf_cuda_library",
@@ -46,19 +47,6 @@
     ]),
 )
 
-tf_custom_op_library(
-    name = "python/ops/_trt_ops.so",
-    srcs = [
-        "ops/get_serialized_resource_op.cc",
-        "ops/trt_engine_op.cc",
-    ],
-    deps = [
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:tensorrt",
-    ]),
-)
-
 cc_library(
     name = "trt_op_kernels",
     srcs = [
@@ -86,6 +74,22 @@
     alwayslink = 1,
 )
 
+tf_cc_shared_object(
+    name = "python/ops/libtftrt.so",
+    srcs = [
+        "ops/get_serialized_resource_op.cc",
+        "ops/trt_engine_op.cc",
+    ],
+    copts = tf_copts(is_external = True),
+    linkopts = ["-lm"],
+    deps = [
+        ":trt_op_kernels",
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]) + tf_custom_op_library_additional_deps(),
+)
+
 tf_cuda_cc_test(
     name = "get_serialized_resource_op_test",
     size = "small",
@@ -149,7 +153,7 @@
     name = "trt_ops_loader",
     srcs = ["python/ops/trt_ops.py"],
     dso = [
-        "python/ops/_trt_ops.so",
+        "python/ops/libtftrt.so",
     ] + if_tensorrt([
         "@local_config_tensorrt//:tensorrt",
     ]),
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
index 713bcbb..f2a6b74 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
@@ -331,9 +331,13 @@
   // Construct the const nodes first.
   subgraph_nodes.insert(subgraph_nodes.begin(), added_const_nodes.begin(),
                         added_const_nodes.end());
+  string scope_name;
   TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef(
       g, graph_properties, subgraph_nodes, &info->connections,
-      &info->segment_graph_def, &info->engine_name));
+      &info->segment_graph_def, &scope_name));
+  info->engine_name = StrCat(scope_name, info->engine_name);
+  VLOG(1) << "Converted TensorRT candidate segment '" << info->engine_name
+          << "' to a GraphDef";
   // TODO(sami): This should not happen once segmenter is updated.
   if (segment_devices.size() == 1) {
     info->device = *segment_devices.begin();
@@ -492,8 +496,7 @@
   // these segments.
   if (inputs.empty()) {
     return errors::Internal(
-        "Segment has no inputs (possible "
-        "constfold failure)");
+        "Segment has no inputs (possible constfold failure)");
   }
 
   const bool calibrate_int8 =
@@ -839,6 +842,7 @@
   for (size_t t = 0; t < initial_segments.size(); t++) {
     auto& curr_segment = initial_segments.at(t);
     EngineInfo curr_engine;
+    curr_engine.engine_name = StrCat("TRTEngineOp_", t);
     Status status =
         GetEngineInfo(&graph, *params.graph_properties, curr_segment.first,
                       node_map, reverse_topo_order, &curr_engine);
@@ -854,7 +858,6 @@
     curr_engine.use_calibration = params.use_calibration;
     curr_engine.cached_engine_batches = params.cached_engine_batches;
     curr_engine.maximum_cached_engines = params.max_cached_engines;
-    StrAppend(&curr_engine.engine_name, "TRTEngineOp_", t);
     if (params.use_function_backup) {
       status = RegisterSegmentFunctionToFunctionLibrary(
           &graph, curr_engine.segment_graph_def, curr_engine.engine_name);
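
Naming note: `engine_name` is now seeded with `TRTEngineOp_<index>` before `GetEngineInfo()` runs, and the segment's scope is prepended inside it, so each candidate segment carries a stable name from the start. A minimal sketch of how the two pieces compose; the scope value and its trailing `/` are assumptions, not taken from the patch:

```cpp
// Illustrative only (not TF code): composition of the final engine name.
#include <iostream>
#include <string>

int main() {
  std::string engine_name = "TRTEngineOp_0";         // seeded before GetEngineInfo
  const std::string scope_name = "resnet/block_1/";  // hypothetical segment scope
  engine_name = scope_name + engine_name;            // prepended inside GetEngineInfo
  std::cout << engine_name << "\n";                  // resnet/block_1/TRTEngineOp_0
  return 0;
}
```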
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
index f34ecce..45c58d2 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
@@ -311,31 +311,31 @@
   }
 
   const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1;
-  auto compute_output_dims =
-      [](const TRT_TensorOrWeights& input, int broadcast_num_dims,
-         int* output_dims_array, nvinfer1::Dims* output_dims) {
-        const nvinfer1::Dims input_dims = input.GetTrtDims();
-        std::fill(output_dims_array, output_dims_array + max_nb_dims, 1);
-        std::copy(input_dims.d, input_dims.d + input_dims.nbDims,
-                  output_dims_array + broadcast_num_dims - input_dims.nbDims);
-        if (input.is_tensor()) {
-          const int true_input_dims = input_dims.nbDims + 1;
-          if (true_input_dims < broadcast_num_dims) {
-            return errors::InvalidArgument(
-                "Broadcasting beyond batch dimension is not supported ",
-                "(tensor #dims ", true_input_dims, " vs broadcast #dims ",
-                broadcast_num_dims, ")");
-          }
-          // Set the batch dimension to -1, since batch size is not supposed to
-          // be broadcasted.
-          output_dims_array[0] = -1;
-        }
-        // Copy to output dimensions (stripping the batch dimension).
-        output_dims->nbDims = broadcast_num_dims - 1;
-        std::copy(output_dims_array + 1, output_dims_array + broadcast_num_dims,
-                  output_dims->d);
-        return Status::OK();
-      };
+  auto compute_output_dims = [](const TRT_TensorOrWeights& input,
+                                int broadcast_num_dims, int* output_dims_array,
+                                nvinfer1::Dims* output_dims) {
+    const nvinfer1::Dims input_dims = input.GetTrtDims();
+    std::fill(output_dims_array, output_dims_array + max_nb_dims, 1);
+    std::copy(input_dims.d, input_dims.d + input_dims.nbDims,
+              output_dims_array + broadcast_num_dims - input_dims.nbDims);
+    if (input.is_tensor()) {
+      const int true_input_dims = input_dims.nbDims + 1;
+      if (true_input_dims < broadcast_num_dims) {
+        return errors::InvalidArgument(
+            "Broadcasting beyond batch dimension is not supported ",
+            "(tensor #dims ", true_input_dims, " vs broadcast #dims ",
+            broadcast_num_dims, ")");
+      }
+      // Set the batch dimension to -1, since batch size is not supposed to
+      // be broadcasted.
+      output_dims_array[0] = -1;
+    }
+    // Copy to output dimensions (stripping the batch dimension).
+    output_dims->nbDims = broadcast_num_dims - 1;
+    std::copy(output_dims_array + 1, output_dims_array + broadcast_num_dims,
+              output_dims->d);
+    return Status::OK();
+  };
 
   // Compute the output dimensions.
   const int broadcast_num_dims =
@@ -367,11 +367,13 @@
   if (!layer) return nullptr;
   const nvinfer1::DataType trt_dtype = trt_weights.type;
   nvinfer1::ITensor* trt_tensor = layer->getOutput(0);
+#if !IS_TRT_VERSION_GE(5, 1, 3)
   // TODO(laigd): there is a bug in TensorRT 5.0 library that, if we don't set
   // the data type below, it will always be kFLOAT regardless what the data type
   // of the weights is. Once NVIDIA fixes this bug, we should remove the data
   // type setting logic below and test should still pass.
   trt_tensor->setType(trt_dtype);
+#endif
   return trt_tensor;
 }
 
@@ -574,13 +576,13 @@
 
   void setLocation(nvinfer1::TensorLocation location) override {}
 
-#if NV_TENSORRT_MAJOR >= 5
+#if IS_TRT_VERSION_GE(5, 0, 0)
   bool setDynamicRange(float min, float max) override { return true; }
 
   float getDynamicRange() const override { return 0; }
 #endif
 
-#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+#if IS_TRT_VERSION_GE(5, 1, 0)
   bool dynamicRangeIsSet() const override { return true; }
 
   void resetDynamicRange() override {}
@@ -1281,7 +1283,7 @@
   // Infer ranges across marked ops.
   PropagateQuantizationRanges();
   // Apply ranges.
-#if NV_TENSORRT_MAJOR >= 5
+#if IS_TRT_VERSION_GE(5, 0, 0)
   for (auto pair : quantization_ranges_) {
     nvinfer1::ITensor* tensor = pair.first;
     const float range = pair.second;
@@ -1923,6 +1925,7 @@
       {"RealDiv", nvinfer1::ElementWiseOperation::kDIV},
       {"Minimum", nvinfer1::ElementWiseOperation::kMIN},
       {"Maximum", nvinfer1::ElementWiseOperation::kMAX},
+      {"Pow", nvinfer1::ElementWiseOperation::kPOW},
   };
   auto op_pair = ops.find(node_def.op());
   if (op_pair == ops.end()) {
@@ -2296,7 +2299,9 @@
   }
 // TRT 5.1 adds a slice layer. For older versions, we attempt to use the
 // padding layer with negative padding.
-#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+#if IS_TRT_VERSION_GE(5, 1, 0) && 0
+  // TODO(laigd): TRT 5.1 RC has a bug when ISliceLayer is used along with
+  // IConcatenationLayer, so disable ISliceLayer for now until it's fixed.
   // Use ISliceLayer.
   nvinfer1::Dims begin_dims, size_dims, stride_dims;
   TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(begin, &begin_dims,
@@ -3215,7 +3220,7 @@
             {"Sqrt", nvinfer1::UnaryOperation::kSQRT},
             {"Abs", nvinfer1::UnaryOperation::kABS},
             {"Reciprocal", nvinfer1::UnaryOperation::kRECIP},
-#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+#if IS_TRT_VERSION_GE(5, 1, 0)
             {"Sin", nvinfer1::UnaryOperation::kSIN},
             {"Cos", nvinfer1::UnaryOperation::kCOS},
             {"Tan", nvinfer1::UnaryOperation::kTAN},
@@ -4001,7 +4006,7 @@
     (*registration)[quantization_op_type] = ConvertQuantize;
   }
   for (auto binary_op_type :
-       {"Add", "Mul", "Sub", "Div", "RealDiv", "Maximum", "Minimum"}) {
+       {"Add", "Mul", "Sub", "Div", "RealDiv", "Maximum", "Minimum", "Pow"}) {
     (*registration)[binary_op_type] = ConvertBinary;
   }
   for (auto activation_op_type : {"Relu", "Sigmoid", "Tanh"}) {
@@ -4144,7 +4149,7 @@
     const Graph* graph, const grappler::GraphProperties& graph_properties,
     const std::vector<const Node*>& subgraph_nodes,  // In topological order
     std::vector<EngineConnection>* connections, GraphDef* segment_def,
-    string* common_scope) {
+    string* scope_name) {
   std::set<string> marker_nodes;
   // Update connection shapes/data types and add corresponding input/output
   // nodes in the segment graphdef.
@@ -4277,9 +4282,7 @@
       snode->mutable_input()->RemoveLast();
     }
   }
-  *common_scope = local_scope;
-  VLOG(1) << "Converted TensorRT candidate segment @scope '" << local_scope
-          << "' to a GraphDef";
+  *scope_name = local_scope;
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
index 6333d91..068482a 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
@@ -43,6 +43,12 @@
 
 namespace convert {
 
+#define IS_TRT_VERSION_GE(major, minor, patch)                  \
+  ((NV_TENSORRT_MAJOR > major) ||                               \
+   (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR > minor) || \
+   (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \
+    NV_TENSORRT_PATCH >= patch))
+
 struct EngineConnection {
   // Constructs a non-control edge.
   EngineConnection(const string& outside, int out_id, int out_port,
@@ -123,13 +129,14 @@
 //   topological order.
 // - segment_def: the output GraphDef, whose non-input/output nodedefs will be
 //   sorted in topological order.
+// - scope_name: the name of the scope where the TRTEngineOp will be placed.
 //
 // TODO(aaroey): add tests to validate these properties.
 Status ConvertSegmentToGraphDef(
     const Graph* graph, const grappler::GraphProperties& graph_properties,
     const std::vector<const Node*>& subgraph_nodes,
     std::vector<EngineConnection>* connections, GraphDef* segment_def,
-    string* common_scope);
+    string* scope_name);
 
 // Converts given subgraph to a TRT engine saved in 'engine'. Returns ok iff
 // 'builder' successfully build the engine. If the result is not ok, 'engine'
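
`IS_TRT_VERSION_GE` centralizes the TensorRT version checks that the rest of this patch switches over to, replacing the raw `NV_TENSORRT_MAJOR`/`NV_TENSORRT_MINOR` comparisons. A self-contained sketch of how a guard reads with it; the hard-coded `NV_TENSORRT_*` values below are stand-ins for what `NvInfer.h` defines:

```cpp
// Minimal sketch: the macro from convert_nodes.h exercised against fake
// version macros. Real builds get NV_TENSORRT_* from NvInfer.h.
#include <cstdio>

#define NV_TENSORRT_MAJOR 5  // assumption: pretend we build against TRT 5.1.2
#define NV_TENSORRT_MINOR 1
#define NV_TENSORRT_PATCH 2

#define IS_TRT_VERSION_GE(major, minor, patch)                  \
  ((NV_TENSORRT_MAJOR > major) ||                               \
   (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR > minor) || \
   (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \
    NV_TENSORRT_PATCH >= patch))

int main() {
#if IS_TRT_VERSION_GE(5, 1, 0)
  std::printf("TRT >= 5.1: strided-slice and Sin/Cos/Tan paths compile in\n");
#else
  std::printf("TRT < 5.1: fall back to the padding-layer path\n");
#endif
  return 0;
}
```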
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
index 5e3177e..bd656b0 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
@@ -233,7 +233,7 @@
     location_ = location;
   }
 
-#if NV_TENSORRT_MAJOR >= 5
+#if IS_TRT_VERSION_GE(5, 0, 0)
   bool setDynamicRange(float min, float max) override {
     dynamic_range_ = std::max(std::abs(min), std::abs(max));
     return true;
@@ -242,7 +242,7 @@
   float getDynamicRange() const override { return dynamic_range_; }
 #endif
 
-#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+#if IS_TRT_VERSION_GE(5, 1, 0)
   bool dynamicRangeIsSet() const override { return true; }
 
   void resetDynamicRange() override {}
@@ -845,7 +845,7 @@
 
   // Input range should be inferred along the chain and applied to tensors.
   int8_converter.MaybeApplyQuantizationRanges();
-#if NV_TENSORRT_MAJOR >= 5
+#if IS_TRT_VERSION_GE(5, 0, 0)
   EXPECT_EQ(input.getDynamicRange(), 5.0f);
   EXPECT_EQ(infer_1.getDynamicRange(), 5.0f);
   EXPECT_EQ(infer_2.getDynamicRange(), 5.0f);
@@ -1964,6 +1964,10 @@
   } else if (node_def.op() == "Maximum") {
     EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                 ElementsAre(CType(3), CType(6), CType(3), CType(6)));
+  } else if (node_def.op() == "Pow") {
+    ExpectArrayNear(
+        std::vector<CType>{CType(9), CType(36), CType(27), CType(216)},
+        GetSpanForData<CType>(output_data[0]));
   } else {
     ASSERT_TRUE(false);
   }
@@ -2037,6 +2041,7 @@
   TestBinaryTensorOpTensor<ops::RealDiv, DT_FLOAT>(this);
   TestBinaryTensorOpTensor<ops::Minimum, DT_FLOAT>(this);
   TestBinaryTensorOpTensor<ops::Maximum, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Pow, DT_FLOAT>(this);
 
   TestBinaryTensorOpTensor<ops::Add, DT_HALF>(this);
   TestBinaryTensorOpTensor<ops::Sub, DT_HALF>(this);
@@ -2045,6 +2050,7 @@
   TestBinaryTensorOpTensor<ops::RealDiv, DT_HALF>(this);
   TestBinaryTensorOpTensor<ops::Minimum, DT_HALF>(this);
   TestBinaryTensorOpTensor<ops::Maximum, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Pow, DT_HALF>(this);
 }
 
 TEST_F(OpConverterTest, ConvertQuantize) {
@@ -2666,7 +2672,7 @@
     RunValidationAndConversion(node_def);
   }
 // TRT 5.1+ supports strides
-#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+#if IS_TRT_VERSION_GE(5, 1, 0)
   {
     // Negative strides, should fail.
     Reset();
@@ -2729,7 +2735,7 @@
   // Same input is used for all tests.
   const std::vector<float> ok_input = {1, 2, 3, 4, 5, 6};
 
-#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+#if IS_TRT_VERSION_GE(5, 1, 0)
   const int kStridedSliceOKCases = 23;
 #else
   const int kStridedSliceOKCases = 19;
@@ -2856,7 +2862,7 @@
                /*end_mask=*/get_mask({1, 0, 0, 0}),
                /*expected_output_dims=*/{1, 2, 3},
                /*expected_output=*/{1, 2, 3, 4, 5, 6}},
-#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+#if IS_TRT_VERSION_GE(5, 1, 0)
     // Strides
     TestParams{/*input_dims=*/{6},
                /*begin=*/{0, 0}, /*end=*/{0, 5}, /*strides=*/{1, 2},
diff --git a/tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py b/tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py
index 25fb3a1..62ac5a5 100644
--- a/tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py
+++ b/tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py
@@ -23,19 +23,19 @@
 import platform
 from tensorflow.python.framework import errors
 
-_trt_ops_so = None
+_tf_trt_so = None
 _module_lock = threading.Lock()
 
 
 def load_trt_ops():
   """Load TF-TRT op libraries so if it hasn't been loaded already."""
-  global _trt_ops_so
+  global _tf_trt_so
 
   if platform.system() == "Windows":
     raise RuntimeError("Windows platforms are not supported")
 
   with _module_lock:
-    if _trt_ops_so:
+    if _tf_trt_so:
       return
 
     try:
@@ -56,8 +56,8 @@
       from tensorflow.python.platform import resource_loader
       # pylint: enable=g-import-not-at-top
 
-      _trt_ops_so = load_library.load_op_library(
-          resource_loader.get_path_to_datafile("_trt_ops.so"))
+      _tf_trt_so = load_library.load_op_library(
+          resource_loader.get_path_to_datafile("libtftrt.so"))
     except errors.NotFoundError as e:
       no_trt_message = (
           "**** Failed to initialize TensorRT. This is either because the "
diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
index 9cab9d7..593b991 100644
--- a/tensorflow/compiler/tf2tensorrt/segment/segment.cc
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
@@ -668,10 +668,13 @@
     const string& segment_root = itr.first;
     // Return format does not require set comparator.
     std::set<const Node*> segment_nodes(itr.second.begin(), itr.second.end());
-    if (VLOG_IS_ON(1)) {
-      string s = "parent=" + segment_root + ":";
-      for (auto node : segment_nodes) s += " " + node->name();
-      VLOG(1) << "Segment " << segments->size() << ": " << s;
+    if (VLOG_IS_ON(1) && !segment_nodes.empty()) {
+      string s;
+      for (auto node : segment_nodes) {
+        StrAppend(&s, "\n[Op type: ", node->type_string(), "] ", node->name());
+      }
+      VLOG(1) << "Nodes in segment " << segments->size()
+              << " with parent=" << segment_root << ":" << s;
     }
 
     // Don't use small segments.
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 36bef22..ef74925 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -33,6 +33,7 @@
         "diag_op.cc",
         "dynamic_slice_ops.cc",
         "dynamic_stitch_op.cc",
+        "einsum_op.cc",
         "elu_op.cc",
         "empty_op.cc",
         "extract_image_patches_op.cc",
diff --git a/tensorflow/compiler/tf2xla/kernels/einsum_op.cc b/tensorflow/compiler/tf2xla/kernels/einsum_op.cc
new file mode 100644
index 0000000..6b3334d
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/einsum_op.cc
@@ -0,0 +1,54 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <array>
+
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+namespace {
+
+constexpr std::array<DataType, 2> kEinsumTypes = {{DT_BFLOAT16, DT_FLOAT}};
+
+class EinsumOp : public XlaOpKernel {
+ public:
+  explicit EinsumOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("equation", &equation_));
+  }
+
+  ~EinsumOp() override = default;
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaOp lhs = ctx->Input(0);
+    xla::XlaOp rhs = ctx->Input(1);
+    const TensorShape a_shape = ctx->InputShape(0);
+    const TensorShape b_shape = ctx->InputShape(1);
+    ctx->SetOutput(0, xla::Einsum(lhs, rhs, equation_));
+  }
+
+ private:
+  string equation_;
+  TF_DISALLOW_COPY_AND_ASSIGN(EinsumOp);
+};
+
+REGISTER_XLA_OP(Name("XlaEinsum").TypeConstraint("T", kEinsumTypes), EinsumOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
index 2c430e3..5ac288d 100644
--- a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
@@ -67,6 +67,13 @@
       }
       for (int i = 0; i < fft_rank_; i++) {
         int index = input_shape.dims() - fft_rank_ + i;
+        OP_REQUIRES(
+            ctx,
+            input_shape.dim_size(index) == 0 ||
+                input_shape.dim_size(index) >= expected_sizes[i],
+            errors::InvalidArgument(
+                "Input dimension ", index, " must have length of at least ",
+                expected_sizes[i], " but got: ", input_shape.dim_size(index)));
         if (input_shape.dim_size(index) > expected_sizes[i]) {
           slice_sizes[index] = expected_sizes[i];
         } else {
diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD
index 7140b6a..cb6e0fb 100644
--- a/tensorflow/compiler/tf2xla/ops/BUILD
+++ b/tensorflow/compiler/tf2xla/ops/BUILD
@@ -17,6 +17,7 @@
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
index e5f0969..f13ac88 100644
--- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc
+++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
@@ -14,6 +14,7 @@
 ==============================================================================*/
 
 #include "absl/algorithm/container.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
@@ -506,5 +507,73 @@
      is faster when input is large and rank of input is higher than 1.
 )doc");
 
+REGISTER_OP("XlaEinsum")
+    .Input("a: T")
+    .Input("b: T")
+    .Output("product: T")
+    .Attr("equation: string")
+    .Attr("T: {bfloat16, float}")
+    .SetShapeFn([](shape_inference::InferenceContext* context) {
+      shape_inference::ShapeHandle input_a = context->input(0);
+      shape_inference::ShapeHandle input_b = context->input(1);
+
+      int64 rank_a, rank_b;
+      if (context->RankKnown(input_a)) {
+        rank_a = context->Rank(input_a);
+      } else {
+        return errors::InvalidArgument("input 0's rank is unknown.");
+      }
+      if (context->RankKnown(input_b)) {
+        rank_b = context->Rank(input_b);
+      } else {
+        return errors::InvalidArgument("input 1's rank is unknown.");
+      }
+      string equation;
+      TF_RETURN_IF_ERROR(context->GetAttr("equation", &equation));
+
+      std::map<char, shape_inference::DimensionHandle> left_map;
+      std::map<char, shape_inference::DimensionHandle> right_map;
+      std::vector<shape_inference::DimensionHandle> dims;
+
+      std::vector<string> equation_split = absl::StrSplit(equation, "->");
+
+      if (equation_split.size() != 2) {
+        return errors::InvalidArgument("Expected one \"->\" in equation. Got: ",
+                                       equation);
+      }
+
+      std::vector<string> lhs_rhs_split =
+          absl::StrSplit(equation_split[0], ',');
+      if (lhs_rhs_split.size() != 2) {
+        return errors::InvalidArgument("Expected one \",\" in equation. Got: ",
+                                       equation);
+      }
+      for (const char& c : lhs_rhs_split[0]) {
+        left_map[c] = context->Dim(input_a, left_map.size());
+      }
+      for (const char& c : lhs_rhs_split[1]) {
+        right_map[c] = context->Dim(input_b, right_map.size());
+      }
+
+      for (const char& c : equation_split[1]) {
+        if (left_map.count(c)) {
+          dims.push_back(left_map[c]);
+        } else if (right_map.count(c)) {
+          dims.push_back(right_map[c]);
+        } else {
+          return errors::InvalidArgument("Invalid equation: ", equation);
+        }
+      }
+
+      context->set_output(0, context->MakeShape(dims));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+An op which supports basic einsum with 2 inputs and 1 output.
+
+This op has better TPU performance since it doesn't have the explicit reshape
+and transpose operations that tf.einsum does.
+)doc");
+
 }  // namespace
 }  // namespace tensorflow
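
The shape function maps each index letter on the left of `->` to the corresponding dimension of input `a` or `b`, then assembles the output shape from the letters on the right. A standalone sketch of that mapping with plain ints standing in for `DimensionHandle`; the equation and shapes are made-up examples:

```cpp
// Illustrative only: mirrors the XlaEinsum shape function's label-to-dim
// mapping for the hypothetical equation "ab,bc->ac".
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  const std::string equation = "ab,bc->ac";
  const std::vector<int> a_dims = {2, 3};  // shape of input a
  const std::vector<int> b_dims = {3, 5};  // shape of input b

  const auto arrow = equation.find("->");
  const std::string inputs = equation.substr(0, arrow);
  const std::string output = equation.substr(arrow + 2);
  const auto comma = inputs.find(',');
  const std::string lhs = inputs.substr(0, comma);
  const std::string rhs = inputs.substr(comma + 1);

  std::map<char, int> left_map, right_map;
  for (size_t i = 0; i < lhs.size(); ++i) left_map[lhs[i]] = a_dims[i];
  for (size_t i = 0; i < rhs.size(); ++i) right_map[rhs[i]] = b_dims[i];

  std::vector<int> out_dims;
  for (char c : output) {
    // A label missing from both maps is an InvalidArgument in the real op.
    out_dims.push_back(left_map.count(c) ? left_map[c] : right_map[c]);
  }
  for (int d : out_dims) std::printf("%d ", d);  // prints: 2 5
  std::printf("\n");
  return 0;
}
```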
diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py
index 19c5333..8732ee0 100644
--- a/tensorflow/compiler/tf2xla/python/xla.py
+++ b/tensorflow/compiler/tf2xla/python/xla.py
@@ -304,6 +304,27 @@
 
 dynamic_slice = gen_xla_ops.xla_dynamic_slice
 dynamic_update_slice = gen_xla_ops.xla_dynamic_update_slice
+einsum = gen_xla_ops.xla_einsum
+
+
+@ops.RegisterGradient('XlaEinsum')
+def _einsum_grad(op, grad):
+  equation = op.get_attr('equation')
+  inputs, output = equation.split('->')
+  left, right = inputs.split(',')
+
+  return [
+      gen_xla_ops.xla_einsum(
+          grad,
+          op.inputs[1],
+          equation='{},{}->{}'.format(output, right, left),
+          name=None),
+      gen_xla_ops.xla_einsum(
+          grad,
+          op.inputs[0],
+          equation='{},{}->{}'.format(output, left, right),
+          name=None)
+  ]
 
 # TODO(phawkins): generalize tf.pad to support interior padding, and then remove
 # the XLA-specific pad operator.
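
The registered gradient rewires the equation by moving the output labels into the inputs of two new einsums. For the plain matrix-product equation `ij,jk->ik` this reduces to the familiar matmul gradients (a worked instance, not taken from the patch):

$$
C = AB:\qquad
\frac{\partial L}{\partial A} = \bar{C}\,B^{\top}\ \ (\texttt{"ik,jk->ij"}),\qquad
\frac{\partial L}{\partial B} = A^{\top}\bar{C}\ \ (\texttt{"ik,ij->jk"}),
$$

where $\bar{C} = \partial L/\partial C$ is the incoming gradient passed as `grad`.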
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index 93a5d9d..e5e4bf8 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -114,7 +114,7 @@
   // Collect all _Arg nodes.
   std::unordered_map<int, Node*> arg_nodes;
   for (Node* n : g->op_nodes()) {
-    if (n->type_string() == FunctionLibraryDefinition::kArgOp) {
+    if (n->IsArg()) {
       int index;
       TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
       arg_nodes[index] = n;
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 8758191..86a2517 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -92,14 +92,14 @@
   std::map<int, int> arg_cores;
   std::map<int, int> retval_cores;
   for (const Node* n : graph.nodes()) {
-    if (n->type_string() == FunctionLibraryDefinition::kArgOp) {
+    if (n->IsArg()) {
       TF_ASSIGN_OR_RETURN(int core, get_sharding_for_node(n));
       if (core < 0) continue;
       int index;
       TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
       TF_RET_CHECK(index >= 0) << "Negative _Arg index";
       arg_cores[index] = core;
-    } else if (n->type_string() == FunctionLibraryDefinition::kRetOp) {
+    } else if (n->IsRetval()) {
       TF_ASSIGN_OR_RETURN(int core, get_sharding_for_node(n));
       if (core < 0) continue;
       int index;
@@ -581,16 +581,14 @@
   // lowest-numbered core that consumes the argument. We choose the
   // lowest-numbered core so the assignment is deterministic.
   for (Node* n : graph->nodes()) {
-    if (absl::string_view(n->type_string()) ==
-        FunctionLibraryDefinition::kArgOp) {
+    if (n->IsArg()) {
       TF_RETURN_IF_ERROR(SetNodeShardingFromNeighbors(n, /*out_edges=*/true));
     }
   }
   // Do _Retval as a second loop, in case the retval's input is an _Arg (which
   // may have gotten a device assignment from the first loop).
   for (Node* n : graph->nodes()) {
-    if (absl::string_view(n->type_string()) ==
-        FunctionLibraryDefinition::kRetOp) {
+    if (n->IsRetval()) {
       TF_RETURN_IF_ERROR(SetNodeShardingFromNeighbors(n, /*out_edges=*/false));
     }
   }
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 07bf937..ea33780 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -846,6 +846,7 @@
     deps =
         [
             ":parse_flags_from_env",
+            ":status",
             "//tensorflow/compiler/xla:xla_proto",
             "//tensorflow/compiler/xla/service:hlo",
             "//tensorflow/core:framework_internal",
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index ae1a459..d5ade8f 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -118,6 +118,7 @@
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/compiler/xla/service:dump",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service:local_service",
diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc
index ec0e089..f2d124d 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.cc
+++ b/tensorflow/compiler/xla/client/executable_build_options.cc
@@ -71,9 +71,8 @@
   }
   return absl::StrFormat(
       "ExecutableBuildOptions{device_ordinal=%d, result_layout=%s, "
-      "generate_hlo_graph=%s, num_replicas=%d}",
-      device_ordinal_, result_layout, debug_options().xla_generate_hlo_graph(),
-      num_replicas_);
+      "num_replicas=%d}",
+      device_ordinal_, result_layout, num_replicas_);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 48b5f94..ae7d3d9 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -21,6 +21,7 @@
 #include "llvm/ADT/Triple.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/backend.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/service_executable_run_options.h"
 #include "tensorflow/compiler/xla/service/source_map_util.h"
 #include "tensorflow/compiler/xla/service/stream_pool.h"
@@ -185,7 +186,7 @@
       executable_->ExecuteOnStream(run_options, arguments,
                                    /*hlo_execution_profile=*/nullptr));
   TF_RETURN_IF_ERROR(RecordResult(&result, executable_->hlo_snapshot()));
-  TF_RETURN_IF_ERROR(executable_->DumpHloSnapshot());
+  DumpHloSnapshotIfEnabled(executable_->module(), *executable_->hlo_snapshot());
   return std::move(result);
 }
 
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 4f4fc8d..2dd8c13 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -67,10 +67,10 @@
       const ExecutableRunOptions& run_options, const Backend& backend);
 
   // Records the computation in a SessionModule proto with the arguments used to
-  // invoke it, and the result. Enabled by flag: --tla_dump_executions_to.
+  // invoke it, and the result. Enabled by flag: --xla_dump_hlo_snapshots.
   //
-  // The given ServiceExecutableRunOptions override any values from TF_XLA_FLAGS
-  // environment variable.
+  // The given ServiceExecutableRunOptions override any values from the
+  // XLA_FLAGS environment variable.
   StatusOr<ScopedShapedBuffer> ExecuteAndDump(
       const ServiceExecutableRunOptions* run_options,
       const absl::Span<const ShapedBuffer* const> arguments);
diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc
index 43d9ee0..aaef4a1 100644
--- a/tensorflow/compiler/xla/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/debug_options_flags.cc
@@ -33,7 +33,7 @@
   opts.set_xla_cpu_multi_thread_eigen(true);
   opts.set_xla_gpu_cuda_data_dir("./cuda_sdk_lib");
   opts.set_xla_eliminate_hlo_implicit_broadcast(true);
-  opts.set_xla_hlo_dump_as_html(false);
+  opts.set_xla_dump_hlo_as_html(false);
 #ifdef INTEL_MKL
   opts.set_xla_cpu_use_mkl_dnn(true);
 #endif  // INTEL_MKL
@@ -84,6 +84,14 @@
     };
   };
 
+  auto string_setter_for =
+      [](void (DebugOptions::*member_setter)(const string& value)) {
+        return [member_setter](const string& value) {
+          (flag_values->*member_setter)(value);
+          return true;
+        };
+      };
+
   // Custom "sub-parser" lambda for xla_disable_hlo_passes.
   auto setter_for_xla_disable_hlo_passes = [](string comma_separated_values) {
     std::vector<string> disabled_passes =
@@ -115,38 +123,6 @@
 
   flag_objects = new std::vector<tensorflow::Flag>({
       tensorflow::Flag(
-          "xla_generate_hlo_graph",
-          flag_values->mutable_xla_generate_hlo_graph(),
-          "HLO modules matching this regex will be dumped to a .dot file "
-          "throughout various stages in compilation."),
-      tensorflow::Flag(
-          "xla_hlo_graph_addresses",
-          bool_setter_for(&DebugOptions::set_xla_hlo_graph_addresses),
-          flag_values->xla_hlo_graph_addresses(),
-          "With xla_generate_hlo_graph, show addresses of HLO ops in "
-          "graph dump."),
-      tensorflow::Flag(
-          "xla_hlo_graph_path", flag_values->mutable_xla_hlo_graph_path(),
-          "With xla_generate_hlo_graph, dump the graphs into this path."),
-      tensorflow::Flag("xla_hlo_dump_as_html",
-                       bool_setter_for(&DebugOptions::set_xla_hlo_dump_as_html),
-                       flag_values->xla_hlo_dump_as_html(),
-                       "Dump HLO graphs as an HTML (DOT rendered into SVG "
-                       "inlined in HTML)."),
-      tensorflow::Flag(
-          "xla_hlo_graph_sharding_color",
-          bool_setter_for(&DebugOptions::set_xla_hlo_graph_sharding_color),
-          flag_values->xla_hlo_graph_sharding_color(),
-          "Assign colors based on sharding assignments when generating the "
-          "HLO graphs."),
-      tensorflow::Flag(
-          "xla_log_hlo_text", flag_values->mutable_xla_log_hlo_text(),
-          "HLO modules matching this regex will be dumped to LOG(INFO)."),
-      tensorflow::Flag(
-          "xla_generate_hlo_text_to",
-          flag_values->mutable_xla_generate_hlo_text_to(),
-          "Dump all HLO modules as text into the provided directory path."),
-      tensorflow::Flag(
           "xla_cpu_enable_fast_math",
           bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_math),
           flag_values->xla_cpu_enable_fast_math(),
@@ -211,9 +187,6 @@
           flag_values->xla_embed_ir_in_executable(),
           "Embed the compiler IR as a string in the executable."),
       tensorflow::Flag(
-          "xla_dump_ir_to", flag_values->mutable_xla_dump_ir_to(),
-          "Dump the compiler IR into this directory as individual files."),
-      tensorflow::Flag(
           "xla_eliminate_hlo_implicit_broadcast",
           bool_setter_for(
               &DebugOptions::set_xla_eliminate_hlo_implicit_broadcast),
@@ -248,20 +221,6 @@
           flag_values->xla_gpu_max_kernel_unroll_factor(),
           "Specify the maximum kernel unroll factor for the GPU backend."),
       tensorflow::Flag(
-          "xla_dump_optimized_hlo_proto_to",
-          flag_values->mutable_xla_dump_optimized_hlo_proto_to(),
-          "Dump Hlo after all hlo passes are executed as proto binary into "
-          "this directory."),
-      tensorflow::Flag(
-          "xla_dump_unoptimized_hlo_proto_to",
-          flag_values->mutable_xla_dump_unoptimized_hlo_proto_to(),
-          "Dump HLO before any hlo passes are executed as proto binary into "
-          "this directory."),
-      tensorflow::Flag("xla_dump_per_pass_hlo_proto_to",
-                       flag_values->mutable_xla_dump_per_pass_hlo_proto_to(),
-                       "Dump HLO after each pass as an HloProto in binary file "
-                       "format into this directory."),
-      tensorflow::Flag(
           "xla_test_all_output_layouts",
           bool_setter_for(&DebugOptions::set_xla_test_all_output_layouts),
           flag_values->xla_test_all_output_layouts(),
@@ -283,14 +242,6 @@
           bool_setter_for(&DebugOptions::set_xla_hlo_profile),
           flag_values->xla_hlo_profile(),
           "Instrument the computation to collect per-HLO cycle counts"),
-      tensorflow::Flag("xla_dump_computations_to",
-                       flag_values->mutable_xla_dump_computations_to(),
-                       "Dump computations that XLA executes into the provided "
-                       "directory path"),
-      tensorflow::Flag("xla_dump_executions_to",
-                       flag_values->mutable_xla_dump_executions_to(),
-                       "Dump parameters and results of computations that XLA "
-                       "executes into the provided directory path"),
       tensorflow::Flag("xla_backend_extra_options",
                        setter_for_xla_backend_extra_options, "",
                        "Extra options to pass to a backend; "
@@ -343,6 +294,79 @@
               &DebugOptions::set_xla_gpu_disable_ptxas_optimizations),
           flag_values->xla_gpu_disable_ptxas_optimizations(),
           "In XLA:GPU run ptxas in -O0 (default is -O3)."),
+
+      tensorflow::Flag(
+          "xla_dump_to", string_setter_for(&DebugOptions::set_xla_dump_to),
+          flag_values->xla_dump_to(),
+          "Directory into which debugging data is written.  If not specified "
+          "but another dumping flag is passed, data will be written to stdout. "
+          " To explicitly write to stdout, set this to \"-\".  The values "
+          "\"sponge\" and \"test_undeclared_outputs_dir\" have a special "
+          "meaning: They cause us to dump into the directory specified by the "
+          "environment variable TEST_UNDECLARED_OUTPUTS_DIR."),
+      tensorflow::Flag(
+          "xla_dump_hlo_as_text",
+          bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_text),
+          flag_values->xla_dump_hlo_as_text(),
+          "Dumps HLO modules as text before and after optimizations.  Results "
+          "are written to the --xla_dump_to dir, or, if no dir is specified, "
+          "to stdout."),
+      tensorflow::Flag(
+          "xla_dump_hlo_as_proto",
+          bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_proto),
+          flag_values->xla_dump_hlo_as_proto(),
+          "Dumps HLO modules as HloProtos to the directory specified by "
+          "--xla_dump_to."),
+      tensorflow::Flag(
+          "xla_dump_hlo_as_dot",
+          bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_dot),
+          flag_values->xla_dump_hlo_as_dot(),
+          "Dumps HLO modules rendered as dot files to the directory "
+          "specified by --xla_dump_to."),
+      tensorflow::Flag("xla_dump_hlo_as_html",
+                       bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_html),
+                       flag_values->xla_dump_hlo_as_html(),
+                       "Dumps HLO modules rendered as HTML files to the "
+                       "directory specified by --xla_dump_to."),
+      tensorflow::Flag(
+          "xla_dump_hlo_as_url",
+          bool_setter_for(&DebugOptions::set_xla_dump_hlo_as_url),
+          flag_values->xla_dump_hlo_as_url(),
+          "Tries to dump HLO modules rendered as URLs to stdout (and also to "
+          "the directory specified by --xla_dump_to). This is not implemented "
+          "by default; you need to add a plugin which calls "
+          "RegisterGraphToURLRenderer()."),
+      tensorflow::Flag(
+          "xla_dump_hlo_snapshots",
+          bool_setter_for(&DebugOptions::set_xla_dump_hlo_snapshots),
+          flag_values->xla_dump_hlo_snapshots(),
+          "Every time an HLO module is run, dumps an HloSnapshot to the "
+          "directory specified by --xla_dump_to."),
+      tensorflow::Flag(
+          "xla_dump_hlo_module_re",
+          string_setter_for(&DebugOptions::set_xla_dump_hlo_module_re),
+          flag_values->xla_dump_hlo_module_re(),
+          "Limits dumping only to modules which match this regular expression. "
+          " Default is to dump all modules."),
+      tensorflow::Flag(
+          "xla_dump_hlo_pass_re",
+          string_setter_for(&DebugOptions::set_xla_dump_hlo_pass_re),
+          flag_values->xla_dump_hlo_pass_re(),
+          "If specified, dumps HLO before and after optimization passes which "
+          "match this regular expression, in addition to dumping at the very "
+          "beginning and end of compilation."),
+      tensorflow::Flag(
+          "xla_hlo_graph_addresses",
+          bool_setter_for(&DebugOptions::set_xla_hlo_graph_addresses),
+          flag_values->xla_hlo_graph_addresses(),
+          "When rendering graphs (--xla_dump_hlo_as_{dot,html,url}), displays "
+          "the address in memory of each HloInstruction object."),
+      tensorflow::Flag(
+          "xla_hlo_graph_sharding_color",
+          bool_setter_for(&DebugOptions::set_xla_hlo_graph_sharding_color),
+          flag_values->xla_hlo_graph_sharding_color(),
+          "Assign colors based on sharding assignments when generating the "
+          "HLO graphs."),
   });
   ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
 }
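
The new `string_setter_for` and the existing `bool_setter_for` bind a `DebugOptions` setter, held as a pointer-to-member-function, into the callback signature the flag parser expects. A minimal stand-alone sketch of that pattern; `Options` and `set_dump_to` are stand-ins, not XLA types:

```cpp
// Sketch of the setter-factory pattern: capture a pointer to a member
// setter and hand the flag parser a plain callable.
#include <iostream>
#include <string>

struct Options {  // stand-in for xla::DebugOptions
  void set_dump_to(const std::string& v) { dump_to = v; }
  std::string dump_to;
};

int main() {
  Options flag_values;
  auto string_setter_for =
      [&flag_values](void (Options::*member_setter)(const std::string&)) {
        return [&flag_values, member_setter](const std::string& value) {
          (flag_values.*member_setter)(value);
          return true;  // tells the parser the value was accepted
        };
      };

  auto set_dump_to = string_setter_for(&Options::set_dump_to);
  set_dump_to("/tmp/foo");
  std::cout << flag_values.dump_to << "\n";  // prints /tmp/foo
  return 0;
}
```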
diff --git a/tensorflow/compiler/xla/g3doc/jit.md b/tensorflow/compiler/xla/g3doc/jit.md
index 85fa16c..d7ce5ee 100644
--- a/tensorflow/compiler/xla/g3doc/jit.md
+++ b/tensorflow/compiler/xla/g3doc/jit.md
@@ -144,7 +144,8 @@
 feature of XLA via an environmental variable that outputs the XLA graph.
 
 ```shell
-XLA_FLAGS="--xla_hlo_graph_path=/tmp --xla_generate_hlo_graph=.*" python mnist_softmax_xla.py
+XLA_FLAGS="--xla_hlo_profile --xla_dump_to=/tmp/foo --xla_dump_hlo_as_text"
+python mnist_softmax_xla.py
 ```
 
 Open the timeline file created (`timeline.ctf.json`).  The rendered timeline
@@ -153,28 +154,10 @@
   <img style="width:100%" src="./images/jit_timeline_gpu_xla.png">
 </div>
 
-To understand what is happening in `XlaLaunch`, look at the console output for
-statements similar to the following:
+To understand what is happening in `XlaLaunch`, look at the console output. Each
+XLA cluster that's launched will have a corresponding profile (from
+`--xla_hlo_profile`) showing how long each HLO took to run.
 
-```shell
-computation cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=1].v82 [CPU:
-pipeline start, before inline]: /tmp/hlo_graph_0.dot
-
-```
-
-The console statements point to the location of `hlo_graph_xx.dot` files that
-contain information about the graph created by XLA. The process that XLA takes
-to fuse Ops is visible by starting at `hlo_graph_0.dot` and viewing each diagram
-in succession.
-
-To Render the .dot file into a png, install
-[GraphViz](https://www.graphviz.org/download/) and run:
-
-```shell
-dot -Tpng hlo_graph_80.dot -o hlo_graph_80.png
-```
-
-The result will look like the following:
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="./images/jit_gpu_xla_graph.png">
-</div>
+`/tmp/foo` will contain the HLO before and after optimizations for each HLO
+module that's run. You can read this as-is, or you can visualize it using
+`tensorflow/compiler/xla/tools:interactive_graphviz`.
diff --git a/tensorflow/compiler/xla/protobuf_util.cc b/tensorflow/compiler/xla/protobuf_util.cc
index ac342bf..e476015 100644
--- a/tensorflow/compiler/xla/protobuf_util.cc
+++ b/tensorflow/compiler/xla/protobuf_util.cc
@@ -38,42 +38,14 @@
   return (serialized1 == serialized2);
 }
 
-namespace {
-
-std::pair<tensorflow::mutex*, std::vector<std::function<string(string)>>*>
-GetDirectoryExpanders() {
-  static auto* mutex = new tensorflow::mutex;
-  static auto* singleton = new std::vector<std::function<string(string)>>;
-  return {mutex, singleton};
-}
-
-// Runs all the directory expanders over x and returns the result.
-string Expand(string x) {
-  auto pair = GetDirectoryExpanders();
-  tensorflow::mutex_lock lock(*pair.first);
-  for (const auto& f : *pair.second) {
-    x = f(x);
-  }
-  return x;
-}
-
-}  // namespace
-
 Status DumpProtoToDirectory(const tensorflow::protobuf::Message& message,
                             const string& directory, const string& file_name) {
   tensorflow::Env* env = tensorflow::Env::Default();
-  string expanded_dir = Expand(directory);
-  TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(expanded_dir));
+  TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(directory));
   string safe_file_name = SanitizeFileName(file_name) + ".pb";
-  const string path = tensorflow::io::JoinPath(expanded_dir, safe_file_name);
+  const string path = tensorflow::io::JoinPath(directory, safe_file_name);
   return tensorflow::WriteBinaryProto(env, path, message);
 }
 
-void RegisterDirectoryExpander(const std::function<string(string)>& expander) {
-  auto pair = GetDirectoryExpanders();
-  tensorflow::mutex_lock lock(*pair.first);
-  pair.second->push_back(expander);
-}
-
 }  // namespace protobuf_util
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index a4934cb..ffbfa7a 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -392,10 +392,9 @@
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModule> hlo_module,
       HloModule::CreateFromProto(computation_.proto(), module_config));
-  hlo_graph_dumper::DotGraphOptions options;
-  options.debug_options = &hlo_module->config().debug_options();
-  return hlo_graph_dumper::HloComputationToDotGraph(
-      *hlo_module->entry_computation(), options);
+  return RenderGraph(*hlo_module->entry_computation(), /*label=*/"",
+                     hlo_module->config().debug_options(),
+                     RenderedGraphFormat::kDot);
 }
 
 StatusOr<ProgramShape> Computation::GetProgramShape() const {
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 7d5921d..a1a4f00 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -196,41 +196,38 @@
   if ($input == Py_None) {
     $1 = NULL;
   } else {
-    if (!HandleStringAttribute($input, "generate_hlo_graph", [&](string s) {
-      build_options.mutable_debug_options()->set_xla_generate_hlo_graph(std::move(s));
+    if (!HandleStringAttribute($input, "dump_to", [&](string s) {
+      build_options.mutable_debug_options()->set_xla_dump_to(std::move(s));
     })) {
       return nullptr;
     }
-    if (!HandleStringAttribute($input, "dump_optimized_hlo_proto_to", [&](string s) {
-      build_options.mutable_debug_options()->set_xla_dump_optimized_hlo_proto_to(std::move(s));
+    if (!HandleStringAttribute($input, "dump_hlo_pass_re", [&](string s) {
+      build_options.mutable_debug_options()->set_xla_dump_hlo_pass_re(std::move(s));
     })) {
       return nullptr;
     }
-    if (!HandleStringAttribute($input, "dump_unoptimized_hlo_proto_to", [&](string s) {
-      build_options.mutable_debug_options()->set_xla_dump_unoptimized_hlo_proto_to(std::move(s));
+    if (!HandleStringAttribute($input, "dump_hlo_module_re", [&](string s) {
+      build_options.mutable_debug_options()->set_xla_dump_hlo_module_re(std::move(s));
     })) {
       return nullptr;
     }
-    if (!HandleStringAttribute($input, "dump_per_pass_hlo_proto_to", [&](string s) {
-      build_options.mutable_debug_options()->set_xla_dump_per_pass_hlo_proto_to(std::move(s));
+    if (!HandleBoolAttribute($input, "dump_hlo_as_text", [&](bool b) {
+      build_options.mutable_debug_options()->set_xla_dump_hlo_as_text(b);
+    })) {
+      return nullptr;
+    }
+    if (!HandleBoolAttribute($input, "dump_hlo_as_proto", [&](bool b) {
+      build_options.mutable_debug_options()->set_xla_dump_hlo_as_proto(b);
+    })) {
+      return nullptr;
+    }
+    if (!HandleBoolAttribute($input, "hlo_profile", [&](bool b) {
+      build_options.mutable_debug_options()->set_xla_hlo_profile(b);
     })) {
       return nullptr;
     }
 
-    PyObject* o = PyObject_GetAttrString($input, "hlo_profile");
-    if (o == NULL) {
-      SWIG_fail;
-    }
-    if (o != Py_None) {
-      if (!PyBool_Check(o)) {
-        PyErr_SetString(PyExc_TypeError, "ExecutableBuildOptions.hlo_profile must be a bool or None.");
-        SWIG_fail;
-      }
-      build_options.mutable_debug_options()->set_xla_hlo_profile(o == Py_True);
-    }
-    Py_DECREF(o);
-
-    o = PyObject_GetAttrString($input, "result_shape");
+    PyObject* o = PyObject_GetAttrString($input, "result_shape");
     if (o == nullptr) {
       return nullptr;
     }
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc
index 74f45b7..de7b1e4 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.cc
+++ b/tensorflow/compiler/xla/python/numpy_bridge.cc
@@ -620,6 +620,32 @@
   return true;  // Handled string attribute, ok!
 }
 
+// Returns "ok"; true if there is no error, false if there was an error.
+bool HandleBoolAttribute(PyObject* o, const char* attr_name,
+                         std::function<void(bool b)> f) {
+  if (!PyObject_HasAttrString(o, attr_name)) {
+    return true;  // It's ok for the object to not have the attribute.
+  }
+  PyObject* attr = PyObject_GetAttrString(o, attr_name);
+  if (attr == nullptr) {
+    return false;  // An error occurred getting the attribute.
+  }
+  if (attr == Py_None) {
+    Py_DECREF(attr);
+    return true;  // The attribute is None, which we consider ok.
+  }
+  if (!PyBool_Check(attr)) {
+    string message = absl::StrFormat("%s must be a boolean or none; got %s",
+                                     attr_name, numpy::PyObjectCppRepr(attr));
+    PyErr_SetString(PyExc_TypeError, message.c_str());
+    Py_DECREF(attr);
+    return false;  // Type error, not ok.
+  }
+  f(PyObject_IsTrue(attr));
+  Py_DECREF(attr);
+  return true;  // Handled boolean attribute, ok!
+}
+
 bool HandleRepeatedInt64Attribute(
     PyObject* o, const char* attr_name,
     tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* field) {
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.h b/tensorflow/compiler/xla/python/numpy_bridge.h
index eff8cda..d7a611d 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.h
+++ b/tensorflow/compiler/xla/python/numpy_bridge.h
@@ -143,6 +143,8 @@
 // Returns "ok"; true if there is no error, false if there was an error.
 bool HandleStringAttribute(PyObject* o, const char* attr_name,
                            std::function<void(string s)> f);
+bool HandleBoolAttribute(PyObject* o, const char* attr_name,
+                         std::function<void(bool b)> f);
 
 bool HandleRepeatedInt64Attribute(
     PyObject* o, const char* attr_name,
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index b068380..cb7d19d 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -659,11 +659,12 @@
   """
 
   def __init__(self):
-    self.generate_hlo_graph = None
-    self.dump_optimized_hlo_proto_to = None
-    self.dump_unoptimized_hlo_proto_to = None
-    self.dump_per_pass_hlo_proto_to = None
-    self.hlo_profile = False
+    self.xla_dump_to = None
+    self.dump_hlo_pass_re = None
+    self.dump_hlo_module_re = None
+    self.dump_hlo_as_text = None
+    self.dump_hlo_as_proto = None
+    self.hlo_profile = None
     self.num_replicas = get_replica_count()
 
 
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 2282691..760e3eb 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -168,6 +168,23 @@
 )
 
 cc_library(
+    name = "dump",
+    srcs = ["dump.cc"],
+    hdrs = ["dump.h"],
+    deps = [
+        ":hlo",
+        ":hlo_graph_dumper",
+        ":hlo_proto_util",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:regexp_internal",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
     name = "shape_inference",
     srcs = ["shape_inference.cc"],
     hdrs = ["shape_inference.h"],
@@ -703,6 +720,7 @@
         ":compiler",
         ":computation_layout",
         ":device_memory_allocator",
+        ":dump",
         ":dynamic_dimension_inference",
         ":executable",
         ":execution_tracker",
@@ -782,6 +800,7 @@
         ":backend",
         ":compiler",
         ":computation_layout",
+        ":dump",
         ":platform_util",
         ":service",
         "//tensorflow/compiler/xla:debug_options_flags",
@@ -881,6 +900,7 @@
     deps = [
         ":computation_layout",
         ":device_memory_allocator",
+        ":dump",
         ":hlo",
         ":hlo_execution_profile",
         ":hlo_graph_dumper",
@@ -2003,6 +2023,7 @@
         "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:window_util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/types:span",
@@ -2034,7 +2055,10 @@
     srcs = ["dynamic_padder_test.cc"],
     deps = [
         ":dynamic_padder",
+        ":hlo",
+        ":hlo_matchers",
         ":hlo_parser",
+        ":hlo_runner",
         "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -2043,9 +2067,6 @@
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_matchers",
-        "//tensorflow/compiler/xla/service:hlo_runner",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test",
     ],
@@ -2056,6 +2077,9 @@
     srcs = ["dynamic_dimension_inference_test.cc"],
     deps = [
         ":dynamic_dimension_inference",
+        ":hlo",
+        ":hlo_matchers",
+        ":hlo_runner",
         "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -2063,9 +2087,6 @@
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_matchers",
-        "//tensorflow/compiler/xla/service:hlo_runner",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test",
     ],
@@ -2243,24 +2264,6 @@
     ],
 )
 
-tf_cc_binary(
-    name = "graphviz_example",
-    srcs = ["graphviz_example.cc"],
-    deps = [
-        ":hlo",
-        ":hlo_graph_dumper",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-    ],
-)
-
 tf_cc_test(
     name = "hlo_module_test",
     srcs = ["hlo_module_test.cc"],
@@ -2624,6 +2627,7 @@
     hdrs = ["copy_insertion.h"],
     deps = [
         ":buffer_liveness",
+        ":dump",
         ":hlo",
         ":hlo_alias_analysis",
         ":hlo_dce",
@@ -2879,6 +2883,7 @@
         "hlo_pass_pipeline.h",
     ],
     deps = [
+        ":dump",
         ":hlo",
         ":hlo_graph_dumper",
         ":hlo_pass",
@@ -2888,6 +2893,7 @@
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "//tensorflow/core:regexp_internal",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
@@ -3219,6 +3225,7 @@
     hdrs = ["hlo_module_config.h"],
     deps = [
         ":computation_layout",
+        ":computation_placer",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -3267,10 +3274,7 @@
 
 cc_library(
     name = "hlo_graph_dumper",
-    srcs = [
-        "hlo_graph_dumper.cc",
-        "hlo_graph_html_renderer.cc",
-    ],
+    srcs = ["hlo_graph_dumper.cc"],
     hdrs = ["hlo_graph_dumper.h"],
     deps = [
         ":hlo",
@@ -3280,6 +3284,7 @@
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index 1965925..5209da9 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -23,6 +23,7 @@
 #include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -70,22 +71,6 @@
     TF_RET_CHECK(instance.computation.has_host_program_shape());
 
     const DebugOptions& debug_options = options.debug_options();
-
-    // Dump computation proto if flag is set.
-    const string& directory_path = debug_options.xla_dump_computations_to();
-    if (!directory_path.empty()) {
-      HloSnapshot hlo_snapshot;
-      *hlo_snapshot.mutable_hlo()->mutable_hlo_module() = instance.computation;
-      string filename =
-          absl::StrCat("computation_", instance.computation.id(), "__",
-                       instance.computation.entry_computation_name());
-      const string& per_host_path = tensorflow::io::JoinPath(
-          directory_path, tensorflow::port::Hostname());
-
-      TF_RETURN_IF_ERROR(
-          Executable::DumpToDirectory(per_host_path, filename, hlo_snapshot));
-    }
-
     ExecutionOptions execution_options;
     *execution_options.mutable_debug_options() = debug_options;
     *execution_options.mutable_shape_with_output_layout() =
@@ -99,7 +84,7 @@
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloModule> hlo_module,
         HloModule::CreateFromProto(instance.computation, *module_config));
-    TF_RETURN_IF_ERROR(MaybeDumpUnoptimizedHloModule(*hlo_module));
+    DumpHloModuleIfEnabled(*hlo_module, "before_optimizations");
     hlo_modules.push_back(std::move(hlo_module));
   }
 
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index 79b010e..8cb64a3 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -19,6 +19,7 @@
 #include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/hlo_alias_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
@@ -956,14 +957,6 @@
   absl::flat_hash_map<const HloInstruction*, CopyNodes> copy_map_;
 };
 
-void MaybeDumpModule(const string& message, const HloModule& module) {
-  if (VLOG_IS_ON(3)) {
-    VLOG(3) << message;
-    XLA_VLOG_LINES(3, module.ToString());
-    hlo_graph_dumper::MaybeDumpHloModule(module, message);
-  }
-}
-
 }  // namespace
 
 // Add kCopy instructions to the given module to guarantee there is no
@@ -1105,8 +1098,6 @@
 
 Status CopyInsertion::RemoveUnnecessaryCopies(const HloOrdering& ordering,
                                               HloModule* module) {
-  MaybeDumpModule("after adding copies to resolve interference", *module);
-
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
                       HloAliasAnalysis::Run(module, fusion_can_share_buffer_));
 
@@ -1130,8 +1121,6 @@
       }
     }
   }
-  MaybeDumpModule("after removing unnecessary copies", *module);
-
   return Status::OK();
 }
 
@@ -1160,8 +1149,6 @@
   // interference. If all copies were added in step (1) then copy removal would
   // also have to reason about things like constants and parameters live out of
   // the computation.
-  MaybeDumpModule("before copy insertion", *module);
-
   std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
   if (!call_graph->IsFlattened()) {
     return FailedPrecondition(
@@ -1190,23 +1177,25 @@
   HloDCE dce;
   TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
   TF_RETURN_IF_ERROR(dce.Run(module).status());
+  DumpHloModuleDuringPassIfEnabled(
+      name(), "after adding copies to resolve interference", *module);
 
   DependencyHloOrdering dep_ordering(module);
   TF_DCHECK_OK(VerifyNoLiveRangeInterference(dep_ordering, module));
 
   TF_RETURN_IF_ERROR(RemoveUnnecessaryCopies(dep_ordering, module));
+  DumpHloModuleDuringPassIfEnabled(name(), "after removing unnecessary copies",
+                                   *module);
 
   TF_RETURN_IF_ERROR(AddSpecialCaseCopies(*call_graph, module));
-
-  MaybeDumpModule("after adding special-case copies", *module);
+  DumpHloModuleDuringPassIfEnabled(name(), "after adding special-case copies",
+                                   *module);
 
   TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
   TF_RETURN_IF_ERROR(dce.Run(module).status());
   TF_DCHECK_OK(
       VerifyNoLiveRangeInterference(DependencyHloOrdering(module), module));
 
-  MaybeDumpModule("after copy insertion", *module);
-
   if (VLOG_IS_ON(1)) {
     int64 num_total_copies = 0;
     for (HloComputation* computation : module->computations()) {
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 3cd93b1..1f077fb 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -95,6 +95,7 @@
         "@com_google_absl//absl/types:span",
         "//tensorflow/compiler/tf2xla:cpu_function_runtime",
         "//tensorflow/compiler/xla/service:hlo_casting_utils",
+        "//tensorflow/compiler/xla/service:dump",
         "//tensorflow/compiler/xla/service:map_inliner",
         "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter",
         "//tensorflow/compiler/xla/service:scatter_expander",
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index c58e307..d584557 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -70,6 +70,7 @@
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/dot_decomposer.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
@@ -436,46 +437,29 @@
     const LLVMCompiler::ModuleHook& user_post_optimization_hook,
     LLVMCompiler::ModuleHook* pre_optimization_ir_hook,
     LLVMCompiler::ModuleHook* post_optimization_ir_hook) {
-  const string& ir_dump_directory =
-      hlo_module.config().debug_options().xla_dump_ir_to();
-  if (ir_dump_directory.empty()) {
-    *pre_optimization_ir_hook = user_pre_optimization_hook;
-    *post_optimization_ir_hook = user_post_optimization_hook;
-    return Status::OK();
-  }
-
-  const string& hlo_module_name = hlo_module.name();
-
   // Create the IR hooks. If applicable, each IR hook does the following:
   //
   //  * Calls the user supplied module hook.
   //  * Writes out the IR to a file in the output directory designated by
-  //    --xla_dump_ir_to
-
-  *pre_optimization_ir_hook =
-      [user_pre_optimization_hook, ir_dump_directory,
-       hlo_module_name](const llvm::Module& llvm_module) {
-        if (user_pre_optimization_hook) {
-          TF_RETURN_IF_ERROR(user_pre_optimization_hook(llvm_module));
-        }
-        return llvm_ir::DumpIRToDirectory(/*directory_name=*/ir_dump_directory,
-                                          /*hlo_module_name=*/hlo_module_name,
-                                          llvm_module,
-                                          /*optimized=*/false);
-      };
-
-  *post_optimization_ir_hook =
-      [user_post_optimization_hook, ir_dump_directory,
-       hlo_module_name](const llvm::Module& llvm_module) {
-        if (user_post_optimization_hook) {
-          TF_RETURN_IF_ERROR(user_post_optimization_hook(llvm_module));
-        }
-        return llvm_ir::DumpIRToDirectory(/*directory_name=*/ir_dump_directory,
-                                          /*hlo_module_name=*/hlo_module_name,
-                                          llvm_module,
-                                          /*optimized=*/true);
-      };
-
+  //    --xla_dump_to
+  const HloModule* hlo_module_ptr = &hlo_module;
+  auto hook = [user_pre_optimization_hook, user_post_optimization_hook,
+               hlo_module_ptr](bool optimized,
+                               const llvm::Module& llvm_module) {
+    const auto& user_hook =
+        !optimized ? user_pre_optimization_hook : user_post_optimization_hook;
+    if (user_hook) {
+      TF_RETURN_IF_ERROR(user_hook(llvm_module));
+    }
+    llvm_ir::DumpIrIfEnabled(*hlo_module_ptr, llvm_module, optimized);
+    return Status::OK();
+  };
+  *pre_optimization_ir_hook = [hook](const llvm::Module& llvm_module) {
+    return hook(/*optimized=*/false, llvm_module);
+  };
+  *post_optimization_ir_hook = [hook](const llvm::Module& llvm_module) {
+    return hook(/*optimized=*/true, llvm_module);
+  };
   return Status::OK();
 }
 
@@ -490,7 +474,7 @@
       << "Invalid LLVM IR before optimizations:\n"
       << err_stream.str()
       << "\nThis probably indicates a bug in the HLO -> LLVM IR lowering. "
-         "Rerun with --xla_dump_ir_to to get the IR. ";
+         "Rerun with --xla_dump_to to get the IR. ";
   return Status::OK();
 }
 
@@ -534,9 +518,6 @@
 StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
     std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/,
     DeviceMemoryAllocator* /*device_allocator*/) {
-  VLOG(2) << "Before optimization:";
-  XLA_VLOG_LINES(2, module->ToString());
-
   std::unique_ptr<llvm::TargetMachine> jit_target_machine =
       SimpleOrcJIT::InferTargetMachineForJIT(
           CompilerTargetOptions(module->config()),
@@ -544,20 +525,16 @@
 
   TF_RETURN_IF_ERROR(RunHloPasses(module.get(), /*is_aot_compile=*/false,
                                   jit_target_machine.get()));
-
-  VLOG(2) << "After optimization:";
-  XLA_VLOG_LINES(2, module->ToString());
   return std::move(module);
 }
 
 StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
     DeviceMemoryAllocator* /*device_allocator*/) {
-  const string timer_message =
-      "Compiling [" + module->name() + "] for CPU using JIT";
-  XLA_SCOPED_LOGGING_TIMER(timer_message);
-
   VLOG(1) << "Compiling: " << module->name();
+  XLA_SCOPED_LOGGING_TIMER(
+      absl::StrFormat("Compiling [%s] for CPU using JIT", module->name()));
+
   TF_RET_CHECK(stream_exec != nullptr);
   std::call_once(llvm_command_line_options_initialized,
                  &llvm_ir::InitializeLLVMCommandLineOptions, module->config());
@@ -600,8 +577,6 @@
   // ownership is std::moved.
   const bool embed_ir_in_executable =
       module->config().debug_options().xla_embed_ir_in_executable();
-  const string xla_dump_optimized_hlo_proto_to =
-      module->config().debug_options().xla_dump_optimized_hlo_proto_to();
 
   // Select an order for emitting the HLO instructions for each
   // computation. Using this sequence enables tighter buffer liveness analysis
@@ -620,13 +595,11 @@
                           /*allocate_buffers_for_constants=*/true));
   // BufferAssignment::ToString() includes a header, so no need for us to
   // print one ourselves.
-  XLA_VLOG_LINES(2, assignment->ToString());
-
-  if (!xla_dump_optimized_hlo_proto_to.empty()) {
-    HloProto proto = MakeHloProto(*module, *assignment);
-    TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-        proto, xla_dump_optimized_hlo_proto_to, module->name()));
+  if (DumpingEnabledForHloModule(*module)) {
+    DumpToFileInDirOrStdout(*module, "buffer_assignment",
+                            assignment->ToString());
   }
+  DumpHloModuleIfEnabled(*module, *assignment, "after_optimizations");
 
   // Each computation is a single function.  Emit all embedded computations
   // before the entry computation. The order of computations returned from
@@ -681,7 +654,6 @@
     ir_module_string = llvm_ir::DumpModuleToString(*llvm_module);
   }
 
-  XLA_VLOG_LINES(2, "LLVM IR:\n" + llvm_ir::DumpModuleToString(*llvm_module));
   TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module));
 
   // JIT compile the LLVM IR module to in-memory machine code.
@@ -791,15 +763,9 @@
     HloModule* module = modules[i].get();
     VLOG(1) << "Compiling ahead-of-time: " << module->name();
 
-    VLOG(2) << "Before optimization:";
-    XLA_VLOG_LINES(2, module->ToString());
-
     TF_RETURN_IF_ERROR(
         RunHloPasses(module, /*is_aot_compile=*/true, target_machine.get()));
 
-    VLOG(2) << "After optimization:";
-    XLA_VLOG_LINES(2, module->ToString());
-
     TF_ASSIGN_OR_RETURN(HloSchedule schedule,
                         ScheduleModule(module, BufferSizeBytesFunction()));
 
@@ -814,15 +780,11 @@
                             /*allocate_buffers_for_constants=*/true));
     // BufferAssignment::ToString() includes a header, so no need for us to
     // print one ourselves.
-    XLA_VLOG_LINES(2, assignment->ToString());
-
-    const string xla_dump_optimized_hlo_proto_to =
-        module->config().debug_options().xla_dump_optimized_hlo_proto_to();
-    if (!xla_dump_optimized_hlo_proto_to.empty()) {
-      HloProto proto = MakeHloProto(*module, *assignment);
-      TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-          proto, xla_dump_optimized_hlo_proto_to, module->name()));
+    if (DumpingEnabledForHloModule(*module)) {
+      DumpToFileInDirOrStdout(*module, "buffer_assignment",
+                              assignment->ToString());
     }
+    DumpHloModuleIfEnabled(*module, *assignment, "after_optimizations");
 
     std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx;
     std::unordered_map<const HloComputation*, int64> computation_to_profile_idx;
@@ -884,7 +846,6 @@
       TF_RETURN_IF_ERROR(verify_status);
     }
 
-    XLA_VLOG_LINES(2, "LLVM IR:\n" + llvm_ir::DumpModuleToString(llvm_module));
 
     Disassembler disassembler(*target_machine);
     CompilerFunctor compiler_functor(
diff --git a/tensorflow/compiler/xla/service/dump.cc b/tensorflow/compiler/xla/service/dump.cc
new file mode 100644
index 0000000..06d0456
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dump.cc
@@ -0,0 +1,407 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dump.h"
+#include "absl/strings/ascii.h"
+#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_proto_util.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/proto_serialization.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/regexp.h"
+
+namespace xla {
+
+namespace {
+
+using absl::StrCat;
+using absl::StrFormat;
+using absl::string_view;
+
+struct CanonicalDebugOptions {
+  explicit CanonicalDebugOptions(const DebugOptions& opts)
+      : dump_to(opts.xla_dump_to()),
+        dump_as_text(opts.xla_dump_hlo_as_text()),
+        dump_as_proto(opts.xla_dump_hlo_as_proto()),
+        dump_as_dot(opts.xla_dump_hlo_as_dot()),
+        dump_as_html(opts.xla_dump_hlo_as_html()),
+        dump_as_url(opts.xla_dump_hlo_as_url()),
+        dump_snapshots(opts.xla_dump_hlo_snapshots()) {
+    // This constructor examines the values in `opts` and turns on other flags
+    // based on what we think is the user's intent.  To reduce confusion about
+    // what was a user-specified value versus an extrapolated value, within this
+    // function we treat this struct's members as write-only, and read only from
+    // `opts`.
+
+    // If dump_to is empty, default to dumping to stdout.
+    if (opts.xla_dump_to().empty()) {
+      dump_to = "-";
+    }
+
+    // Did the user specify an explicit format for dumping?
+    bool output_format_specified =
+        opts.xla_dump_hlo_as_text() || opts.xla_dump_hlo_as_proto() ||
+        opts.xla_dump_hlo_as_dot() || opts.xla_dump_hlo_as_html() ||
+        opts.xla_dump_hlo_as_url() || opts.xla_dump_hlo_snapshots();
+
+    // If we haven't specified an output format, default to dumping as text.
+    if (!output_format_specified) {
+      dump_as_text = true;
+    }
+
+    // If we specified a regular expression restricting which modules to dump,
+    // respect that.
+    //
+    // If we didn't specify which modules to dump but we passed some other flag
+    // which implies dumping modules, dump all modules.
+    //
+    // Otherwise, don't dump any HLO modules.
+    if (!opts.xla_dump_hlo_module_re().empty()) {
+      // RE2 object is not copyable, and we can't capture "by move", so we
+      // resort to this hack.
+      string pattern = opts.xla_dump_hlo_module_re();
+      should_dump_module = [pattern](string_view module_name) {
+        return RE2::PartialMatch(string(module_name), pattern);
+      };
+    } else if (!opts.xla_dump_hlo_pass_re().empty() ||
+               !opts.xla_dump_to().empty() || output_format_specified) {
+      should_dump_module = [](string_view) { return true; };
+    } else {
+      should_dump_module = [](string_view) { return false; };
+    }
+
+    // Initialize should_dump_pass.  This one is easy: We only dump per-pass
+    // data if the user asked for it explicitly.
+    if (!opts.xla_dump_hlo_pass_re().empty()) {
+      string pattern = opts.xla_dump_hlo_pass_re();
+      should_dump_pass = [pattern](string_view pass_name) {
+        return RE2::PartialMatch(string(pass_name), pattern);
+      };
+    } else {
+      should_dump_pass = [](string_view) { return false; };
+    }
+
+    // Output dirs "sponge" and "test_undeclared_outputs_dir" (case-insensitive)
+    // have a special meaning: Dump into the directory specified by the
+    // environment variable TEST_UNDECLARED_OUTPUTS_DIR.
+    string dump_to_lower = absl::AsciiStrToLower(opts.xla_dump_to());
+    if (dump_to_lower == "sponge" ||
+        dump_to_lower == "test_undeclared_outputs_dir") {
+      const char* dir = getenv("TEST_UNDECLARED_OUTPUTS_DIR");
+      if (dir != nullptr) {
+        dump_to = dir;
+      } else {
+        LOG(ERROR) << "--xla_dump_to=" << opts.xla_dump_to()
+                   << ", but environment variable TEST_UNDECLARED_OUTPUTS_DIR "
+                      "is not set, so cannot dump anywhere.";
+        should_dump_module = [](string_view) { return false; };
+        should_dump_pass = [](string_view) { return false; };
+      }
+    }
+  }
+
+  bool dumping_to_stdout() const { return dump_to == "-"; }
+
+  string dump_to;
+  std::function<bool(string_view module_name)> should_dump_module;
+  std::function<bool(string_view pass_name)> should_dump_pass;
+
+  // dump_ir isn't present here because this file is mostly concerned with
+  // dumping HLO.
+  bool dump_as_text;
+  bool dump_as_proto;
+  bool dump_as_dot;
+  bool dump_as_html;
+  bool dump_as_url;
+  bool dump_snapshots;
+};
+
+string FilenameFor(const HloModule& module, string_view suffix) {
+  return StrFormat("module_%04d.%s", module.unique_id(), suffix);
+}
+
+void DumpToFileInDirImpl(string_view filename, string_view contents,
+                         const CanonicalDebugOptions& opts) {
+  if (opts.dumping_to_stdout()) {
+    LOG(ERROR) << "Refusing to write " << filename
+               << " to stdout.  Pass --xla_dump_to=<path> to write to a file.";
+    return;
+  }
+
+  const string& dir = opts.dump_to;
+  VLOG(1) << "Dumping " << filename << " to " << dir;
+
+  tensorflow::Env* env = tensorflow::Env::Default();
+  // Two threads can race to observe the absence of the dump directory and
+  // simultaneously try to create it, causing the "losing" thread to get a
+  // "directory already exists" error.  We can work around this by checking
+  // again whether the dir exists.
+  if (!env->IsDirectory(dir).ok()) {
+    auto status = env->RecursivelyCreateDir(dir);
+    if (!status.ok() && !env->IsDirectory(dir).ok()) {
+      LOG(ERROR) << "Could not create directory " << dir
+                 << " for dumping XLA debug data: " << status;
+      return;
+    }
+  }
+
+  string file_path =
+      tensorflow::io::JoinPath(dir, SanitizeFileName(string(filename)));
+  auto status = tensorflow::WriteStringToFile(env, file_path, contents);
+  if (!status.ok()) {
+    LOG(ERROR) << "Could not write XLA debug data to " << file_path << ": "
+               << status;
+  }
+}
+
+void DumpToFileInDirOrStdoutImpl(string_view filename, string_view contents,
+                                 const CanonicalDebugOptions& opts) {
+  // Dump to stdout if that's called for.
+  if (opts.dumping_to_stdout()) {
+    std::cout << "*** Begin " << filename << " ***\n"
+              << contents << "\n*** End " << filename << " ***" << std::endl;
+    return;
+  }
+
+  // Otherwise, dump to a file.
+  DumpToFileInDirImpl(filename, contents, opts);
+}
+
+void DumpHloModuleImpl(const HloModule& module,
+                       const BufferAssignment* buffer_assn,
+                       const HloExecutionProfile* profile, string_view suffix,
+                       const CanonicalDebugOptions& opts) {
+  string filename = FilenameFor(module, suffix);
+
+  if (opts.dump_as_text) {
+    DumpToFileInDirOrStdoutImpl(StrCat(filename, ".txt"), module.ToString(),
+                                opts);
+  }
+
+  if (opts.dump_as_proto) {
+    HloProto module_proto =
+        buffer_assn ? MakeHloProto(module, *buffer_assn) : MakeHloProto(module);
+    string pb;
+    if (!tensorflow::SerializeToStringDeterministic(module_proto, &pb)) {
+      pb = "Failed to serialize HLO module proto.";
+    }
+    DumpToFileInDirImpl(StrCat(filename, ".hlo.pb"), pb, opts);
+  }
+
+  auto render_graph = [&](RenderedGraphFormat format) {
+    StatusOr<string> rendered_graph = RenderGraph(
+        *module.entry_computation(),
+        /*label=*/filename, module.config().debug_options(), format, profile);
+    if (rendered_graph.ok()) {
+      return std::move(rendered_graph).ValueOrDie();
+    }
+    return StrFormat("Error rendering graph: %s",
+                     rendered_graph.status().ToString());
+  };
+
+  if (opts.dump_as_dot) {
+    DumpToFileInDirImpl(StrFormat("%s.dot", filename),
+                        render_graph(RenderedGraphFormat::kDot), opts);
+  }
+
+  if (opts.dump_as_html) {
+    DumpToFileInDirImpl(StrFormat("%s.html", filename),
+                        render_graph(RenderedGraphFormat::kHtml), opts);
+  }
+
+  // Special case for rendering graphs as URLs.  We'll dump them to a file
+  // because why not, but we always log them to stdout as well.
+  if (opts.dump_as_url) {
+    string url = render_graph(RenderedGraphFormat::kUrl);
+    std::cout << filename << " --> " << url << std::endl;
+    if (!opts.dumping_to_stdout()) {
+      DumpToFileInDirImpl(StrFormat("%s.url", filename), url, opts);
+    }
+  }
+}
+
+static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
+
+// Maps a module's unique ID to a counter indicating how many times we've dumped
+// this module during the compilation pipeline.  This lets us keep the filenames
+// ordered nicely.
+//
+// Entries added here leak forever; we have no way to GC them when a module
+// dies.  But we only add an entry if dumping is enabled for this module, and
+// dumping a module leaks buffer space in stdout or bytes on disk *way* faster
+// than this hashtable leaks memory.
+static auto& module_id_to_step_number GUARDED_BY(mu) =
+    *new absl::flat_hash_map<int64, int64>();
+
+}  // namespace
+
+void DumpToFileInDir(const HloModule& module, string_view suffix,
+                     string_view contents) {
+  DumpToFileInDirImpl(FilenameFor(module, suffix), contents,
+                      CanonicalDebugOptions(module.config().debug_options()));
+}
+
+void DumpToFileInDirOrStdout(const HloModule& module, string_view suffix,
+                             string_view contents) {
+  DumpToFileInDirOrStdoutImpl(
+      FilenameFor(module, suffix), contents,
+      CanonicalDebugOptions(module.config().debug_options()));
+}
+
+void DumpHloModuleIfEnabled(const HloModule& module, string_view name) {
+  CanonicalDebugOptions opts(module.config().debug_options());
+  if (opts.should_dump_module(module.name())) {
+    DumpHloModuleImpl(module, /*buffer_assn=*/nullptr, /*profile=*/nullptr,
+                      name, opts);
+  }
+}
+void DumpHloModuleIfEnabled(const HloModule& module,
+                            const BufferAssignment& buffer_assn,
+                            string_view name) {
+  CanonicalDebugOptions opts(module.config().debug_options());
+  if (opts.should_dump_module(module.name())) {
+    DumpHloModuleImpl(module, &buffer_assn, /*profile=*/nullptr, name, opts);
+  }
+}
+
+void DumpHloModuleIfEnabled(const HloModule& module,
+                            const HloExecutionProfile& profile,
+                            string_view name) {
+  CanonicalDebugOptions opts(module.config().debug_options());
+  if (opts.should_dump_module(module.name())) {
+    DumpHloModuleImpl(module, /*buffer_assn=*/nullptr, &profile, name, opts);
+  }
+}
+
+bool DumpingEnabledForHloModule(string_view hlo_module_name,
+                                const DebugOptions& opts) {
+  return CanonicalDebugOptions(opts).should_dump_module(hlo_module_name);
+}
+
+bool DumpingToStdout(const DebugOptions& opts) {
+  return CanonicalDebugOptions(opts).dumping_to_stdout();
+}
+
+void DumpHloModuleBetweenPassesIfEnabled(string_view pipeline_name,
+                                         string_view before_pass_name,
+                                         string_view after_pass_name,
+                                         const HloModule& module) {
+  CanonicalDebugOptions opts(module.config().debug_options());
+  if (!opts.should_dump_module(module.name())) {
+    return;
+  }
+
+  if (!opts.should_dump_pass(before_pass_name) &&
+      !opts.should_dump_pass(after_pass_name)) {
+    return;
+  }
+
+  int64 step_number;
+  {
+    tensorflow::mutex_lock lock(mu);
+    step_number = module_id_to_step_number[module.unique_id()]++;
+  }
+
+  string filename_suffix =
+      StrFormat("%04d.%s.after_%s.before_%s", step_number, pipeline_name,
+                after_pass_name, before_pass_name);
+  DumpHloModuleImpl(module, /*buffer_assn=*/nullptr, /*profile=*/nullptr,
+                    filename_suffix, opts);
+}
+
+void DumpHloModuleDuringPassIfEnabled(string_view pass_name,
+                                      string_view step_name,
+                                      const HloModule& module) {
+  CanonicalDebugOptions opts(module.config().debug_options());
+  if (!opts.should_dump_module(module.name()) ||
+      !opts.should_dump_pass(pass_name)) {
+    return;
+  }
+
+  int64 step_number;
+  {
+    tensorflow::mutex_lock lock(mu);
+    step_number = module_id_to_step_number[module.unique_id()]++;
+  }
+
+  string filename_suffix =
+      StrFormat("%04d.%s.%s", step_number, pass_name, step_name);
+  DumpHloModuleImpl(module, /*buffer_assn=*/nullptr, /*profile=*/nullptr,
+                    filename_suffix, opts);
+}
+
+void DumpHloSnapshotIfEnabled(const HloModule& module,
+                              const HloSnapshot& snapshot) {
+  CanonicalDebugOptions opts(module.config().debug_options());
+  if (!opts.should_dump_module(module.name()) || !opts.dump_snapshots) {
+    return;
+  }
+  int64 execution_count;
+  {
+    static auto& module_id_to_execution_count GUARDED_BY(mu) =
+        *new absl::flat_hash_map<int64, int64>();
+    tensorflow::mutex_lock lock(mu);
+    execution_count = module_id_to_execution_count[module.unique_id()]++;
+  }
+  string filename =
+      StrCat(FilenameFor(module, StrFormat("execution_%04d", execution_count)),
+             ".hlo_snapshot.pb");
+  if (opts.dumping_to_stdout()) {
+    LOG(ERROR) << "Refusing to write HLO snapshot proto for " << filename
+               << " to stdout.  Pass --xla_dump_to=<path> to write to a file.";
+    return;
+  }
+  string pb;
+  if (!tensorflow::SerializeToStringDeterministic(snapshot, &pb)) {
+    LOG(ERROR) << "Failed to serialize HLO snapshot proto " << filename;
+  }
+  DumpToFileInDirImpl(filename, pb, opts);
+}
+
+void DumpHloSnapshotIfEnabled(const HloSnapshot& snapshot,
+                              const DebugOptions& opts) {
+  CanonicalDebugOptions canonical_opts(opts);
+  string name = snapshot.hlo().hlo_module().name();
+  if (!canonical_opts.should_dump_module(name) ||
+      !canonical_opts.dump_snapshots) {
+    return;
+  }
+
+  // We don't have a unique id for an HloSnapshot, so in this overload we just
+  // have to use its name.
+  int64 execution_count;
+  {
+    static auto& module_name_to_execution_count GUARDED_BY(mu) =
+        *new absl::flat_hash_map<string, int64>();
+    tensorflow::mutex_lock lock(mu);
+    execution_count = module_name_to_execution_count[name]++;
+  }
+  string filename = StrFormat("module_%s.execution_%04d.hlo_snapshot.pb", name,
+                              execution_count);
+  if (canonical_opts.dumping_to_stdout()) {
+    LOG(ERROR) << "Refusing to write HLO snapshot proto for " << filename
+               << " to stdout.  Pass --xla_dump_to=<path> to write to a file.";
+    return;
+  }
+  string pb;
+  if (!tensorflow::SerializeToStringDeterministic(snapshot, &pb)) {
+    LOG(ERROR) << "Failed to serialize HLO snapshot proto " << filename;
+  }
+  DumpToFileInDirImpl(filename, pb, canonical_opts);
+}
+
+}  // namespace xla
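
CanonicalDebugOptions reduces the raw flags to three decisions: where to dump (xla_dump_to, with "-" meaning stdout), which modules to dump (the module regexp, or every module once any dump flag is set), and which per-pass points to dump (the pass regexp only). A minimal sketch of driving it programmatically, assuming the standard proto-generated setters for the DebugOptions fields read above; the directory and regexps are only examples:

    xla::DebugOptions opts;
    opts.set_xla_dump_to("/tmp/xla_dump");            // omit to dump to stdout ("-")
    opts.set_xla_dump_hlo_as_text(true);              // also the default when no format is set
    opts.set_xla_dump_hlo_module_re("cluster_.*");    // only modules whose name matches
    opts.set_xla_dump_hlo_pass_re("copy-insertion");  // enable per-pass dumps for that pass
    // These options reach the helpers via HloModuleConfig::debug_options().
    // DumpHloModuleIfEnabled then writes module_<id>.<name>.txt into /tmp/xla_dump,
    // and DumpHloModuleDuringPassIfEnabled writes step-numbered files of the form
    // module_<id>.<step>.<pass>.<step_name>.txt.
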
diff --git a/tensorflow/compiler/xla/service/dump.h b/tensorflow/compiler/xla/service/dump.h
new file mode 100644
index 0000000..6edc9b2
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dump.h
@@ -0,0 +1,109 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DUMP_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DUMP_H_
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/xla.pb.h"
+
+// Consolidated utilities for logging information during compilation, usually
+// based on the options specified in the DebugOptions proto.
+//
+// Most functions here take an HloModule and read the DebugOptions from the
+// module's config.
+
+namespace xla {
+
+class BufferAssignment;
+class HloExecutionProfile;
+class HloSnapshot;
+
+// Writes the given string to a file in the xla_dump_to directory specified by
+// module's DebugOptions.
+//
+// If module doesn't have an xla_dump_to directory, does nothing.
+void DumpToFileInDir(const HloModule& module, absl::string_view file_suffix,
+                     absl::string_view contents);
+
+// Like DumpToFileInDir, except if module doesn't have an xla_dump_to directory
+// specified, or if that directory is equal to "-", writes to stdout instead.
+void DumpToFileInDirOrStdout(const HloModule& module,
+                             absl::string_view file_suffix,
+                             absl::string_view contents);
+
+// Dumps the given HLO module if dumping is enabled for the module.  Exactly
+// where and in what formats it's dumped is determined by the module's config.
+//
+// If you pass an HloExecutionProfile, note that currently only DOT-based output
+// formats (i.e. --xla_dump_hlo_as_{dot,html,url}) are able to incorporate it
+// into their output.  Other formats will just ignore the profile.
+void DumpHloModuleIfEnabled(const HloModule& module, absl::string_view name);
+void DumpHloModuleIfEnabled(const HloModule& module,
+                            const BufferAssignment& buffer_assn,
+                            absl::string_view name);
+void DumpHloModuleIfEnabled(const HloModule& module,
+                            const HloExecutionProfile& profile,
+                            absl::string_view name);
+
+// Dumps the given HLO module after running one HLO pass and before running
+// another, if that's enabled.
+void DumpHloModuleBetweenPassesIfEnabled(absl::string_view pipeline_name,
+                                         absl::string_view before_pass_name,
+                                         absl::string_view after_pass_name,
+                                         const HloModule& module);
+
+// Dumps the given HLO module during the given HLO pass, if that's enabled.
+//
+// "step" is a human-readable description of where we are in the middle of this
+// pass.  For example, "before-assigning-layouts".
+void DumpHloModuleDuringPassIfEnabled(absl::string_view pass_name,
+                                      absl::string_view step,
+                                      const HloModule& module);
+
+// Dumps the given HloSnapshot to the module's xla_dump_to dir, if enabled.
+//
+// Prefer the first overload below, as this will give filenames that are
+// consistent with the other methods here.  The second overload (which doesn't
+// take an HloModule) is useful in the cases when you're dumping an HloSnapshot
+// and simply don't have an HloModule.
+void DumpHloSnapshotIfEnabled(const HloModule& module,
+                              const HloSnapshot& snapshot);
+void DumpHloSnapshotIfEnabled(const HloSnapshot& snapshot,
+                              const DebugOptions& opts);
+
+// Returns true if we should dump data for an HloModule.  This is useful if you
+// want to check if DumpToFileInDir{,OrStdout} will do anything before
+// generating an expensive string.
+bool DumpingEnabledForHloModule(absl::string_view hlo_module_name,
+                                const DebugOptions& opts);
+inline bool DumpingEnabledForHloModule(const HloModule& module) {
+  return DumpingEnabledForHloModule(module.name(),
+                                    module.config().debug_options());
+}
+
+// Returns true if DumpToFileInDirOrStdout and DumpHloModuleIfEnabled will write
+// to stdout, rather than to a file on disk.
+//
+// This is useful if you want to do something different when writing to stdout.
+// For example, maybe you have (almost-)duplicate data that you wouldn't mind
+// writing to two files, but you don't want to print twice.
+bool DumpingToStdout(const DebugOptions& opts);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DUMP_H_
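
The header is written so callers can gate expensive string construction on DumpingEnabledForHloModule before calling the writers, which is the pattern the CPU and GPU backends adopt earlier in this diff. A condensed sketch of that call site, assuming `module` and `assignment` as in those backends:

    #include "tensorflow/compiler/xla/service/dump.h"

    // After buffer assignment in a backend's RunBackend():
    if (DumpingEnabledForHloModule(*module)) {
      // Only build the potentially large string when it will actually be written.
      DumpToFileInDirOrStdout(*module, "buffer_assignment", assignment->ToString());
    }
    DumpHloModuleIfEnabled(*module, *assignment, "after_optimizations");
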
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
index a496342..5f7d8a7 100644
--- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
@@ -19,19 +19,10 @@
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/while_util.h"
+#include "tensorflow/compiler/xla/window_util.h"
 
 namespace xla {
 
-namespace {
-bool IsTrivialWindowDimension(const WindowDimension& window_dimension) {
-  return window_dimension.size() == 1 && window_dimension.stride() == 1 &&
-         window_dimension.padding_low() == 0 &&
-         window_dimension.padding_high() == 0 &&
-         window_dimension.window_dilation() == 1 &&
-         window_dimension.base_dilation() == 1;
-}
-}  // namespace
-
 class DynamicDimensionInferenceVisitor : public DfsHloVisitorWithDefault {
  public:
   explicit DynamicDimensionInferenceVisitor(
@@ -366,7 +357,7 @@
         const WindowDimension& window_dimension =
             reduce_window->window().dimensions(dimension);
 
-        if (!IsTrivialWindowDimension(window_dimension)) {
+        if (!window_util::IsTrivialWindowDimension(window_dimension)) {
           return Unimplemented(
               "Dynamic Spatial reduce window is not supported: %s",
               reduce_window->ToString());
@@ -387,7 +378,7 @@
         const WindowDimension& window_dimension =
             select_and_scatter->window().dimensions(dimension);
 
-        if (!IsTrivialWindowDimension(window_dimension)) {
+        if (!window_util::IsTrivialWindowDimension(window_dimension)) {
           return Unimplemented(
               "Dynamic Spatial select and scatter is not supported: %s",
               select_and_scatter->ToString());
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
index 2cd2ef2..36456e5 100644
--- a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
@@ -43,7 +43,6 @@
   }
 
   Status RunInference() {
-    hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before alias analysis");
     TF_ASSIGN_OR_RETURN(DynamicDimensionInference inference,
                         DynamicDimensionInference::Run(module_.get()));
 
@@ -88,6 +87,8 @@
       HloInstruction::CreateParameter(1, scalar_shape_, "param"));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
+
   // Set up dynamic parameter binding.
   TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
       DynamicParameterBinding::DynamicParameter{1, {}},
@@ -112,6 +113,7 @@
       DynamicParameterBinding::DynamicParameter{0, {1}},
       DynamicParameterBinding::DynamicDimension{0, {0}, 1}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_THAT(inference_->GetDynamicSize(param, {0}, 1),
               op::GetTupleElement(param, 1));
@@ -137,6 +139,7 @@
       DynamicParameterBinding::DynamicParameter{0, {1}},
       DynamicParameterBinding::DynamicDimension{0, {0}, 1}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_THAT(inference_->GetDynamicSize(param, {0}, 1),
               op::GetTupleElement(param, 1));
@@ -167,6 +170,7 @@
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 1}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(negate, {}, 1), size_param);
 }
@@ -197,6 +201,7 @@
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 1}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(reduce, {}, 0), size_param);
 }
@@ -228,6 +233,7 @@
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 2}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(reduce, {}, 1), size_param);
   EXPECT_EQ(inference_->GetDynamicSize(reduce, {}, 0), nullptr);
@@ -271,6 +277,7 @@
       DynamicParameterBinding::DynamicParameter{2, {}},
       DynamicParameterBinding::DynamicDimension{1, {}, 0}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(dot, {}, 0), size_param);
   EXPECT_EQ(inference_->GetDynamicSize(dot, {}, 1), nullptr);
@@ -319,6 +326,7 @@
       DynamicParameterBinding::DynamicParameter{2, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 1}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(conv, {}, 1), size_param);
   EXPECT_EQ(inference_->GetDynamicSize(conv, {}, 0), nullptr);
@@ -356,6 +364,7 @@
       DynamicParameterBinding::DynamicParameter{3, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 2}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(transpose, {}, 0), size_param_3);
   EXPECT_EQ(inference_->GetDynamicSize(transpose, {}, 1), size_param_2);
@@ -386,6 +395,7 @@
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 3}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 0), nullptr);
   EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 1), size_param);
@@ -415,6 +425,7 @@
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 1}));
 
+  SCOPED_TRACE(module_->ToString());
   Status status = RunInference();
   EXPECT_EQ(status.code(), tensorflow::error::UNIMPLEMENTED);
 }
@@ -439,6 +450,7 @@
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 0}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(broadcast, {}, 0), nullptr);
   EXPECT_EQ(inference_->GetDynamicSize(broadcast, {}, 1), size_param);
@@ -580,6 +592,7 @@
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 0}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(reduce_window, {}, 0), size_param);
 }
@@ -633,6 +646,7 @@
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{2, {}, 0}));
 
+  SCOPED_TRACE(module_->ToString());
   TF_ASSERT_OK(RunInference());
   EXPECT_EQ(inference_->GetDynamicSize(sns, {}, 0), size_param);
 }
diff --git a/tensorflow/compiler/xla/service/dynamic_padder_test.cc b/tensorflow/compiler/xla/service/dynamic_padder_test.cc
index fda806b..2963dea 100644
--- a/tensorflow/compiler/xla/service/dynamic_padder_test.cc
+++ b/tensorflow/compiler/xla/service/dynamic_padder_test.cc
@@ -43,10 +43,7 @@
   DynamicPadderTest() : HloTestBase() { module_ = CreateNewVerifiedModule(); }
 
   StatusOr<bool> RunPadder() {
-    hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before padder");
-
     DynamicPadder padder;
-
     return padder.Run(module_.get());
   }
 
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 1518d83..7b60c98 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -18,6 +18,7 @@
 #include "absl/memory/memory.h"
 #include "absl/strings/str_format.h"
 #include "tensorflow/compiler/xla/debug_options_flags.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -137,8 +138,6 @@
     XLA_LOG_LINES(
         tensorflow::INFO,
         profile_ptr->ToString(stream->parent()->GetDeviceDescription()));
-    hlo_graph_dumper::MaybeDumpHloModule(module(), "Service::Execute",
-                                         profile_ptr.get());
   }
 
   return return_value;
@@ -146,39 +145,4 @@
 
 int64 Executable::SizeInBytes() { return -1; }
 
-Status Executable::DumpHloSnapshot() {
-  TF_RET_CHECK(dumping_snapshot());
-  TF_RET_CHECK(hlo_snapshot_->has_hlo() &&
-               hlo_snapshot_->hlo().has_hlo_module());
-  const string& directory_path =
-      module_config().debug_options().xla_dump_executions_to();
-  const auto& module = hlo_snapshot_->hlo().hlo_module();
-  string filename =
-      absl::StrFormat("computation_%d__%s__execution_%d", module.id(),
-                      module.entry_computation_name(), ++execution_count_);
-  return Executable::DumpToDirectory(directory_path, filename, *hlo_snapshot_);
-}
-
-/* static */ Status Executable::DumpToDirectory(
-    const string& directory_path, string filename,
-    const HloSnapshot& hlo_session) {
-  tensorflow::Env* env = tensorflow::Env::Default();
-  if (!env->IsDirectory(directory_path).ok()) {
-    // NB! CreateDir does not work reliably with multiple XLA threads -- two
-    // threads can race to observe the absence of the dump directory and
-    // simultaneously try to create it, causing the "losing" thread to get a
-    // "directory already exists" error.
-    TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(directory_path));
-  }
-  filename = SanitizeFileName(std::move(filename));
-  string file_path = tensorflow::io::JoinPath(directory_path, filename);
-  const size_t size = hlo_session.ByteSizeLong();
-  auto serialized = absl::make_unique<char[]>(size);
-  TF_RET_CHECK(tensorflow::SerializeToBufferDeterministic(
-      hlo_session, serialized.get(), size));
-  return tensorflow::WriteStringToFile(
-      tensorflow::Env::Default(), file_path,
-      absl::string_view(serialized.get(), size));
-}
-
 }  // namespace xla
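
With Executable::DumpHloSnapshot and Executable::DumpToDirectory removed, snapshot dumping goes through the flag-driven helper in dump.h. A sketch of the replacement pattern a service-level caller might use, assuming it holds an Executable whose snapshot has been populated (the surrounding condition is illustrative):

    if (executable->dumping_snapshot()) {
      DumpHloSnapshotIfEnabled(executable->module(), *executable->hlo_snapshot());
    }
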
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index b34bca5..a08ec18 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -184,11 +184,6 @@
   }
   bool dumping_snapshot() const { return hlo_snapshot_ != nullptr; }
   HloSnapshot* hlo_snapshot() const { return hlo_snapshot_.get(); }
-  Status DumpHloSnapshot();
-
-  // Dump hlo snapshot to directory_path/filename.
-  static Status DumpToDirectory(const string& directory_path, string filename,
-                                const HloSnapshot& hlo_session);
 
  protected:
   mutable tensorflow::mutex mutex_;
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 2a65d54..8c761df 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -841,6 +841,7 @@
         "//tensorflow/compiler/xla/service:conditional_simplifier",
         "//tensorflow/compiler/xla/service:convolution_group_converter",
         "//tensorflow/compiler/xla/service:dot_decomposer",
+        "//tensorflow/compiler/xla/service:dump",
         "//tensorflow/compiler/xla/service:dynamic_index_splitter",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index ad75e2d..17b7c78 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -38,6 +38,7 @@
 #include "tensorflow/compiler/xla/service/conditional_simplifier.h"
 #include "tensorflow/compiler/xla/service/convolution_group_converter.h"
 #include "tensorflow/compiler/xla/service/dot_decomposer.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h"
@@ -618,9 +619,6 @@
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
     DeviceMemoryAllocator* device_allocator) {
   // We dump the post-optimization HLO in RunBackend so no need to dump it here.
-  VLOG(3) << "*** HLO Before Optimization";
-  XLA_VLOG_LINES(3, module->ToString());
-
   XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunHloPasses");
   tracing::ScopedActivity activity("HLO Transforms", module->name(),
                                    /*is_expensive=*/true);
@@ -674,19 +672,11 @@
           [](LogicalBuffer::Color) { return kXlaAllocatedBufferAlignBytes; },
           /*allow_input_output_aliasing=*/false,
           /*allocate_buffers_for_constants=*/true));
-  // BufferAssignment::Stats::ToString() and BufferAssignment::ToString()
-  // include headers, so no need for us to print them ourselves.
-  XLA_VLOG_LINES(1, buffer_assignment->GetStats().ToString());
-  XLA_VLOG_LINES(2, buffer_assignment->ToString());
-  VLOG(3) << "*** HLO After Optimization";
-  XLA_VLOG_LINES(3, module->ToString());
-  const string xla_dump_optimized_hlo_proto_to =
-      module->config().debug_options().xla_dump_optimized_hlo_proto_to();
-  if (!xla_dump_optimized_hlo_proto_to.empty()) {
-    HloProto proto = MakeHloProto(*module, *buffer_assignment);
-    TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-        proto, xla_dump_optimized_hlo_proto_to, module->name()));
+  if (DumpingEnabledForHloModule(*module)) {
+    DumpToFileInDirOrStdout(*module, "buffer_assignment",
+                            buffer_assignment->ToString());
   }
+  DumpHloModuleIfEnabled(*module, *buffer_assignment, "after_optimizations");
 
   IrEmitterContext ir_emitter_context(module.get(), buffer_assignment.get(),
                                       &stream_exec->GetDeviceDescription(),
@@ -709,21 +699,11 @@
   string ir_module_string_before_opt;
   const bool embed_ir_in_executable =
       module->config().debug_options().xla_embed_ir_in_executable();
-  if (VLOG_IS_ON(3) || embed_ir_in_executable) {
+  if (embed_ir_in_executable) {
     ir_module_string_before_opt = llvm_ir::DumpModuleToString(llvm_module);
-    VLOG(3) << "LLVM module before optimizations:";
-    XLA_VLOG_LINES(3, ir_module_string_before_opt);
   }
 
-  const string& ir_dump_directory =
-      module->config().debug_options().xla_dump_ir_to();
-
-  if (!ir_dump_directory.empty()) {
-    TF_RETURN_IF_ERROR(llvm_ir::DumpIRToDirectory(
-        /*directory_name=*/ir_dump_directory,
-        /*hlo_module_name=*/module->name(), llvm_module,
-        /*optimized=*/false));
-  }
+  llvm_ir::DumpIrIfEnabled(*module, llvm_module, /*optimized=*/false);
 
   {
     XLA_SCOPED_LOGGING_TIMER(
@@ -737,7 +717,7 @@
         << "Invalid LLVM IR before optimizations:\n"
         << err_stream.str()
         << "\nThis probably indicates a bug in the HLO -> LLVM IR lowering. "
-           "Rerun with --xla_dump_ir_to to get the IR. ";
+           "Rerun with --xla_dump_to to get the IR. ";
   }
 
   string libdevice_dir;
@@ -770,35 +750,14 @@
                                           module->config(), libdevice_dir));
   }
 
-  if (!ir_dump_directory.empty()) {
-    TF_RETURN_IF_ERROR(llvm_ir::DumpIRToDirectory(
-        /*directory_name=*/ir_dump_directory,
-        /*hlo_module_name=*/module->name(), llvm_module,
-        /*optimized=*/true));
-  }
+  llvm_ir::DumpIrIfEnabled(*module, llvm_module, /*optimized=*/true);
 
   if (user_post_optimization_hook_) {
     TF_CHECK_OK(user_post_optimization_hook_(llvm_module));
   }
-  VLOG(3) << "LLVM module after optimizations:";
-  XLA_VLOG_LINES(3, llvm_ir::DumpModuleToString(llvm_module));
-  VLOG(3) << "PTX:";
-  XLA_VLOG_LINES(3, ptx);
-
   // Write PTX to IR dump directory, if IR dumping was requested.
-  if (!ir_dump_directory.empty()) {
-    const string ptx_outfile = tensorflow::io::JoinPath(
-        ir_dump_directory, absl::StrCat(module->name(), ".ptx"));
-    auto status = [&] {
-      auto* env = tensorflow::Env::Default();
-      TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(ir_dump_directory));
-      TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(env, ptx_outfile, ptx));
-      return Status::OK();
-    }();
-    if (!status.ok()) {
-      LOG(WARNING) << "Couldn't dump PTX for module " << module->name()
-                   << " to " << ptx_outfile << ": " << status;
-    }
+  if (DumpingEnabledForHloModule(*module)) {
+    DumpToFileInDirOrStdout(*module, "ptx", ptx);
   }
 
   const std::vector<uint8> cubin =
@@ -807,8 +766,10 @@
   auto thunk_schedule = absl::make_unique<ThunkSchedule>(
       ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),
       hlo_schedule->ThunkLaunchOrder());
-  VLOG(3) << "Printing the thunk schedule...";
-  XLA_VLOG_LINES(3, thunk_schedule->ToString());
+  if (DumpingEnabledForHloModule(*module)) {
+    DumpToFileInDirOrStdout(*module, "thunk_schedule",
+                            thunk_schedule->ToString());
+  }
 
   std::unique_ptr<HloProfileIndexMap> profile_index_map;
   std::unique_ptr<HloProfilePrinterData> profile_printer;
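
Note on the hunks above: the per-flag dump paths (xla_dump_optimized_hlo_proto_to, xla_dump_ir_to, and the VLOG-based text dumps) are folded into the centralized helpers from tensorflow/compiler/xla/service/dump.h. A minimal sketch of the resulting call pattern, using only the two helpers whose call shapes appear in this hunk; the wrapper function and the "my_artifact" suffix are illustrative, not part of the change:

#include <string>

#include "tensorflow/compiler/xla/service/dump.h"

// Sketch: emit a per-module artifact only when the user asked for dumping
// (e.g. via --xla_dump_to).  Helper signatures are assumed from the call
// sites in this diff.
void MaybeDumpMyArtifact(const xla::HloModule& module,
                         const std::string& contents) {
  if (xla::DumpingEnabledForHloModule(module)) {
    // Lands in the dump directory, or on stdout if that is what was requested.
    xla::DumpToFileInDirOrStdout(module, "my_artifact", contents);
  }
}
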
diff --git a/tensorflow/compiler/xla/service/graphviz_example.cc b/tensorflow/compiler/xla/service/graphviz_example.cc
deleted file mode 100644
index ef70b68..0000000
--- a/tensorflow/compiler/xla/service/graphviz_example.cc
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Example HLO graph which demonstrates the Graphviz dumper for HLO
-// computations. When run, pushes the example DOT graph to the Graphviz service
-// and prints the URL. Useful for seeing the effect of changes to the graph
-// generation code.
-
-#include <stdio.h>
-#include <memory>
-#include <string>
-
-#include "absl/memory/memory.h"
-#include "absl/strings/str_cat.h"
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/platform/init_main.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace xla {
-namespace {
-
-// Adds a computation to the given HLO module which adds a scalar constant to
-// its parameter and returns the result.
-HloComputation* AddScalarConstantComputation(int64 addend, HloModule* module) {
-  auto builder = HloComputation::Builder(absl::StrCat("add_", addend));
-  auto x_value = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeShape(F32, {}), "x_value"));
-  auto half = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.5)));
-  builder.AddInstruction(HloInstruction::CreateBinary(
-      half->shape(), HloOpcode::kAdd, x_value, half));
-  return module->AddEmbeddedComputation(builder.Build());
-}
-
-// Adds a computation to the given HLO module which sums its two parameters and
-// returns the result.
-HloComputation* ScalarSumComputation(HloModule* module) {
-  auto builder = HloComputation::Builder("add");
-  auto lhs = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "lhs"));
-  auto rhs = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {}), "rhs"));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
-  return module->AddEmbeddedComputation(builder.Build());
-}
-
-// Adds a computation to the given HLO module which forwards its argument to a
-// kCall instruction which then calls the given computation.
-HloComputation* CallForwardingComputation(HloComputation* computation,
-                                          HloModule* module) {
-  auto builder = HloComputation::Builder("call_forward");
-  auto arg = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "arg"));
-  builder.AddInstruction(
-      HloInstruction::CreateCall(arg->shape(), {arg}, computation));
-  return module->AddEmbeddedComputation(builder.Build());
-}
-
-// Create a large, arbitrary computation with many different kinds of
-// instructions. Sets the computation as the entry to an HLO module and returns
-// the module.
-std::unique_ptr<HloModule> MakeBigGraph() {
-  HloModuleConfig config;
-  auto module = absl::make_unique<HloModule>("BigGraph", config);
-
-  auto builder = HloComputation::Builder("TestBigGraphvizGraph");
-
-  // Shapes used in the computation.
-  auto mshape = ShapeUtil::MakeShape(F32, {3, 5});
-  auto vshape = ShapeUtil::MakeShape(F32, {3});
-  auto sshape = ShapeUtil::MakeShape(F32, {3});
-
-  // Create a set of parameter instructions.
-  auto param_v0 =
-      builder.AddInstruction(HloInstruction::CreateParameter(0, vshape, "foo"));
-  auto param_v1 =
-      builder.AddInstruction(HloInstruction::CreateParameter(1, vshape, "bar"));
-  auto param_v2 =
-      builder.AddInstruction(HloInstruction::CreateParameter(2, vshape, "baz"));
-  auto param_s =
-      builder.AddInstruction(HloInstruction::CreateParameter(3, sshape, "qux"));
-  auto param_m =
-      builder.AddInstruction(HloInstruction::CreateParameter(4, mshape, "zzz"));
-
-  // Add an arbitrary expression of different instructions.
-  auto copy = builder.AddInstruction(
-      HloInstruction::CreateUnary(vshape, HloOpcode::kCopy, param_v0));
-  auto clamp = builder.AddInstruction(HloInstruction::CreateTernary(
-      vshape, HloOpcode::kClamp, copy, param_v1, param_v2));
-  DotDimensionNumbers dot_dnums;
-  dot_dnums.add_lhs_contracting_dimensions(1);
-  dot_dnums.add_rhs_contracting_dimensions(0);
-  PrecisionConfig precision_config;
-  precision_config.mutable_operand_precision()->Resize(
-      /*new_size=*/2, PrecisionConfig::DEFAULT);
-  auto dot = builder.AddInstruction(HloInstruction::CreateDot(
-      vshape, clamp, param_v0, dot_dnums, precision_config));
-  auto tuple = builder.AddInstruction(
-      HloInstruction::CreateTuple({dot, param_s, clamp}));
-  auto scalar = builder.AddInstruction(
-      HloInstruction::CreateGetTupleElement(sshape, tuple, 2));
-  auto add_one = AddScalarConstantComputation(1.0, module.get());
-  auto rng = builder.AddInstruction(
-      HloInstruction::CreateRng(vshape, RNG_UNIFORM, {param_m, param_m}));
-  auto one = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
-  auto add_computation = ScalarSumComputation(module.get());
-  builder.AddInstruction(
-      HloInstruction::CreateReduce(vshape, rng, one, {1}, add_computation));
-  auto map1 = builder.AddInstruction(
-      HloInstruction::CreateMap(sshape, {scalar}, add_one));
-  auto map2 = builder.AddInstruction(
-      HloInstruction::CreateMap(sshape, {map1}, add_one));
-  auto map3 = builder.AddInstruction(
-      HloInstruction::CreateMap(sshape, {map2}, add_one));
-
-  // Create a fusion instruction containing the chain of map instructions.
-  auto fusion = builder.AddInstruction(HloInstruction::CreateFusion(
-      sshape, HloInstruction::FusionKind::kLoop, map3));
-  fusion->FuseInstruction(map2);
-  fusion->FuseInstruction(map1);
-
-  // Add a random trace instruction.
-  builder.AddInstruction(HloInstruction::CreateTrace("trace", dot));
-
-// Add a call instruction which calls the call-forwarding computation to call
-// another computation.
-  auto call_computation = CallForwardingComputation(add_one, module.get());
-  builder.AddInstruction(
-      HloInstruction::CreateCall(fusion->shape(), {fusion}, call_computation));
-
-  module->AddEntryComputation(builder.Build());
-  return module;
-}
-
-}  // namespace
-}  // namespace xla
-
-int main(int argc, char** argv) {
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
-
-  auto module = xla::MakeBigGraph();
-
-  printf("Graph URL: %s\n", xla::hlo_graph_dumper::DumpGraph(
-                                *module->entry_computation(),
-                                "Example computation", xla::DebugOptions())
-                                .c_str());
-  return 0;
-}
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
index b6dbf07..e344fbc 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
@@ -20,7 +20,6 @@
 
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
-#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
@@ -48,7 +47,6 @@
   // Run alias analysis on the member module. For convenience returns a
   // reference to the generated analysis stored in analysis_.
   HloAliasAnalysis& RunAnalysis() {
-    hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before alias analysis");
     analysis_ = HloAliasAnalysis::Run(module_.get(),
                                       /*fusion_can_share_buffer=*/nullptr)
                     .ConsumeValueOrDie();
@@ -126,6 +124,7 @@
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kAdd, constant1, constant2));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -160,6 +159,7 @@
   builder.AddInstruction(
       HloInstruction::CreateBinary(scalar_shape_, HloOpcode::kAdd, gte0, gte1));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -203,6 +203,7 @@
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({param0, param1, param0}));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -237,6 +238,8 @@
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({negate0, negate1}));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
+
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
       /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
       /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
@@ -281,6 +284,8 @@
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
+
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
       /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{1},
       /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
@@ -370,6 +375,8 @@
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({negate_1, negate_2}));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
+
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
       /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
       /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
@@ -421,6 +428,7 @@
   auto call = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {constant1, constant2}, called_computation));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -462,6 +470,7 @@
   auto call2 = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {call1, constant2}, called_computation));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -547,6 +556,7 @@
   auto xla_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, tuple));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -647,6 +657,7 @@
 
   FlattenCallGraph flattener;
   TF_ASSERT_OK(flattener.Run(module_.get()).status());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -738,6 +749,7 @@
   auto entry_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition2, outer_body, tuple));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -811,6 +823,7 @@
   auto xla_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, tuple));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -872,6 +885,7 @@
       tuple_shape, HloOpcode::kTupleSelect, pred, select12, select34));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -960,6 +974,7 @@
       HloInstruction::CreateWhile(tuple_shape, condition, body, select));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -997,6 +1012,7 @@
       scalar_shape_, HloOpcode::kBitcast, constant));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -1017,6 +1033,7 @@
   builder.AddInstruction(HloInstruction::CreateTuple({constant, bitcast}));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -1056,6 +1073,7 @@
       builder.AddInstruction(HloInstruction::CreateTuple({negate, xla_while}));
 
   HloComputation* entry = module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 12fbcdb..f0b18d6 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -50,7 +50,6 @@
   // reference to the generated analysis stored in analysis_.
   const HloDataflowAnalysis& RunAnalysis(bool ssa_form,
                                          bool bitcast_defines_value = false) {
-    hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before dataflow analysis");
     analysis_ =
         HloDataflowAnalysis::Run(*module_, ssa_form, bitcast_defines_value)
             .ConsumeValueOrDie();
@@ -109,6 +108,7 @@
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kAdd, constant1, constant2));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -157,6 +157,7 @@
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(scalar_shape_, HloOpcode::kAdd, gte0, gte1));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -212,6 +213,7 @@
   auto gte_out = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(scalar_shape_, gte_tuple, 0));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -267,6 +269,7 @@
   auto call = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {constant1, constant2}, called_computation));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -320,6 +323,7 @@
   auto sub = builder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kSubtract, call1, call2));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -372,6 +376,7 @@
   auto call2 = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {call1, constant2}, called_computation));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -434,6 +439,7 @@
   auto call = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {constant1, constant2}, outer_computation));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -509,6 +515,7 @@
   auto xla_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, tuple));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -614,6 +621,7 @@
   auto xla_while2 = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, xla_while1));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -701,6 +709,7 @@
   auto entry_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, outer_body, tuple));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -796,6 +805,7 @@
   auto xla_while = builder.AddInstruction(
       HloInstruction::CreateWhile(tuple_shape, condition, body, tuple));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -851,6 +861,7 @@
       scalar_shape_, HloOpcode::kSelect, pred, constant1, constant2));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -893,6 +904,7 @@
       tuple_shape, HloOpcode::kTupleSelect, pred, select12, select34));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -964,6 +976,7 @@
       tuple1->shape(), HloOpcode::kTupleSelect, pred, tuple1, tuple2));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -1053,6 +1066,7 @@
       HloInstruction::CreateWhile(tuple->shape(), condition, body, tuple));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -1095,6 +1109,7 @@
       scalar_shape_, HloOpcode::kBitcast, constant));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   {
@@ -1131,6 +1146,7 @@
   auto copy = builder.AddInstruction(
       HloInstruction::CreateUnary(tuple->shape(), HloOpcode::kCopy, tuple));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -1164,6 +1180,7 @@
       HloInstruction::CreateSend(param, token, /*channel_id=*/0));
   auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -1189,6 +1206,7 @@
       HloInstruction::CreateRecv(scalar_shape_, token, /*channel_id=*/0));
   auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
@@ -1224,6 +1242,7 @@
       HloInstruction::CreateUnary(vector_shape_, HloOpcode::kLog, exp));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
   RunAnalysis(GetParam());
 
   DependencyHloOrdering ordering(module_.get());
@@ -1261,6 +1280,7 @@
       vector_shape_, HloOpcode::kAdd, negate, exp));
 
   auto entry = module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
   RunAnalysis(GetParam());
 
   HloSchedule schedule(module_.get());
@@ -1339,6 +1359,7 @@
       HloInstruction::CreateWhile(scalar_shape_, condition, body, param));
 
   auto entry = module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
   bool ssa_form = GetParam();
   RunAnalysis(ssa_form);
 
@@ -1409,6 +1430,7 @@
       HloInstruction::CreateReverse(vector_shape_, negate, {0}));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
   RunAnalysis(GetParam());
 
   DependencyHloOrdering ordering(module_.get());
@@ -1440,6 +1462,7 @@
       vector_shape_, HloOpcode::kAdd, negate, exp));
 
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
   RunAnalysis(GetParam());
 
   DependencyHloOrdering ordering(module_.get());
@@ -1479,6 +1502,7 @@
       vector_shape_, HloOpcode::kAdd, negate, exp));
 
   auto entry = module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
   RunAnalysis(GetParam());
 
   HloSchedule schedule(module_.get());
@@ -1537,6 +1561,7 @@
   builder.AddInstruction(HloInstruction::CreateBinary(
       vector_shape_, HloOpcode::kAdd, negate, call));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
   RunAnalysis(GetParam());
 
   DependencyHloOrdering ordering(module_.get());
@@ -1589,6 +1614,7 @@
       scalar_shape_, pred, constant1, true_computation, constant2,
       false_computation));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloDataflowAnalysis& analysis = RunAnalysis(GetParam());
 
@@ -1682,6 +1708,7 @@
       scalar_shape_, pred, tuple_operand, true_computation, tuple_operand,
       false_computation));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloDataflowAnalysis& analysis = RunAnalysis(GetParam());
 
@@ -1816,6 +1843,7 @@
       scalar_shape_, pred1, tuple_operand, inner_conditional_computation,
       constant3, computation3));
   module_->AddEntryComputation(builder.Build());
+  SCOPED_TRACE(module_->ToString());
 
   const HloDataflowAnalysis& analysis = RunAnalysis(GetParam());
 
diff --git a/tensorflow/compiler/xla/service/hlo_domain_isolator.cc b/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
index 19b5734..3746fbb 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
@@ -27,8 +27,6 @@
 
 StatusOr<bool> RunInternal(HloModule* module,
                            HloDomainIsolator::DomainCreator* creator) {
-  hlo_graph_dumper::MaybeDumpHloModule(*module, "Before Domain Isolator");
-
   int64 added_domains = 0;
   for (HloComputation* computation : module->computations()) {
     // Walk in post order and place all the required kDomain instructions.
@@ -56,9 +54,6 @@
     }
   }
   VLOG(3) << "Added " << added_domains << " kDomain instructions";
-  if (added_domains > 0) {
-    hlo_graph_dumper::MaybeDumpHloModule(*module, "After Domain Isolator");
-  }
   return added_domains > 0;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_domain_remover.cc b/tensorflow/compiler/xla/service/hlo_domain_remover.cc
index 67fad07..4975c3f 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_remover.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_remover.cc
@@ -59,8 +59,6 @@
 
 StatusOr<bool> HloDomainRemover::RunContext::Run() {
   VLOG(4) << "Processing metadata domain: '" << remover_->kind_ << "'";
-  hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before Domain Remover");
-
   int64 removed_domains = 0;
   for (HloComputation* computation : module_->computations()) {
     // First create the domain instruction sets. A domain instruction set is
@@ -97,9 +95,6 @@
   }
   VLOG(3) << "Removed " << removed_domains << " kDomain instructions of '"
           << remover_->kind_ << "' kind";
-  if (removed_domains > 0) {
-    hlo_graph_dumper::MaybeDumpHloModule(*module_, "After Domain Remover");
-  }
   return removed_domains > 0;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 89e8fe3..116b32f 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -41,17 +41,18 @@
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/regexp.h"
 
 namespace xla {
-namespace hlo_graph_dumper {
 namespace {
 
 using absl::nullopt;
@@ -1257,36 +1258,6 @@
   return instr;
 }
 
-class GraphRendererRegistry {
- public:
-  void SetRenderer(std::shared_ptr<GraphRendererInterface> graph_renderer) {
-    tensorflow::mutex_lock lock(mu_);
-    graph_renderer_ = graph_renderer;
-  }
-
-  std::shared_ptr<GraphRendererInterface> GetDefaultRenderer() {
-    tensorflow::mutex_lock lock(mu_);
-    return graph_renderer_;
-  }
-
-  static GraphRendererRegistry* Default() {
-    static GraphRendererRegistry* registry = new GraphRendererRegistry();
-    return registry;
-  }
-
- private:
-  tensorflow::mutex mu_;
-  std::shared_ptr<GraphRendererInterface> graph_renderer_ GUARDED_BY(mu_);
-};
-
-}  // namespace
-
-Registrar::Registrar(std::shared_ptr<GraphRendererInterface> dumper) {
-  GraphRendererRegistry::Default()->SetRenderer(dumper);
-}
-
-namespace {
-
 // Gets a NodeFilter that includes roughly all instructions whose distance from
 // root is <= radius.
 NodeFilter MakeNodeRadiusAroundFilter(
@@ -1449,157 +1420,7 @@
   });
 }
 
-string SaveGraph(const string& graph,
-                 GraphRendererInterface::GraphKind graph_kind,
-                 const string& dest_path) {
-  static std::atomic<int> output_num(0);
-  string file_extension;
-  switch (graph_kind) {
-    case GraphRendererInterface::DOT_GRAPH:
-      file_extension = ".dot";
-      break;
-  }
-  string path = JoinPath(dest_path, StrCat("hlo_graph_", output_num++, "."));
-  auto status = Status::OK();
-  auto env = tensorflow::Env::Default();
-  if (!env->CreateUniqueFileName(&path, file_extension)) {
-    status =
-        Status(tensorflow::error::Code::UNKNOWN,
-               StrCat("Failed to create temporary file to dump HLO graph: ",
-                      strerror(errno)));
-  } else {
-    status = tensorflow::WriteStringToFile(env, path, graph);
-  }
-  if (!status.ok()) {
-    LOG(WARNING) << "Saving HLO graph failed: " << status;
-  }
-  return path;
-}
-
-string ExportGraph(const string& graph,
-                   GraphRendererInterface::GraphKind graph_kind,
-                   const DebugOptions& debug_options) {
-  string path = debug_options.xla_hlo_graph_path();
-  if (!path.empty() && !debug_options.xla_hlo_dump_as_html()) {
-    return SaveGraph(graph, graph_kind, path);
-  } else {
-    auto graph_renderer =
-        GraphRendererRegistry::Default()->GetDefaultRenderer();
-    CHECK(graph_renderer != nullptr)
-        << "No registered renderer for the HLO graph. "
-           "Use --xla_hlo_graph_path=PATH --xla_hlo_dump_as_html=false to "
-           "export to local file system";
-    return graph_renderer->RenderGraph(graph, graph_kind, debug_options);
-  }
-}
-
-}  // namespace
-
-string HloComputationToDotGraph(const HloComputation& computation,
-                                const DotGraphOptions& options) {
-  DebugOptions default_debug_options;
-  return HloDotDumper(&computation, options.label,
-                      options.debug_options ? *options.debug_options
-                                            : default_debug_options,
-                      options.show_backend_config, options.profile,
-                      NodeFilter())
-      .Dump();
-}
-
-string DumpGraph(const HloComputation& computation, const string& label,
-                 const DebugOptions& debug_options,
-                 const HloExecutionProfile* hlo_execution_profile,
-                 bool show_backend_config) {
-  GraphRendererInterface::GraphKind graph_kind;
-  string graph =
-      HloDotDumper(&computation, label, debug_options, show_backend_config,
-                   hlo_execution_profile, NodeFilter())
-          .Dump();
-  graph_kind = GraphRendererInterface::DOT_GRAPH;
-
-  string graph_url = ExportGraph(graph, graph_kind, debug_options);
-  LOG(INFO) << "computation " << computation.name() << " [" << label
-            << "]: " << graph_url;
-  return graph_url;
-}
-
-string DumpNeighborhoodAround(
-    const HloInstruction& node, int radius, bool show_backend_config,
-    const absl::flat_hash_set<const HloInstruction*>& boundary) {
-  auto debug_options = node.GetModule()->config().debug_options();
-  string label =
-      StrCat("Neighborhood of ", radius, " nodes around ", node.name());
-  NodeFilter filter = MakeNodeRadiusAroundFilter(&node, radius, boundary);
-  string graph =
-      HloDotDumper(node.parent(), label, debug_options, show_backend_config,
-                   /*profile=*/nullptr, filter)
-          .Dump();
-  return ExportGraph(graph, GraphRendererInterface::DOT_GRAPH, debug_options);
-}
-
-string DumpAllPathsFromTo(const HloInstruction& from, const HloInstruction& to,
-                          int64 max_nodes, bool show_backend_config) {
-  CHECK_EQ(from.parent(), to.parent()) << "Nodes must be in same computation!";
-  auto debug_options = from.GetModule()->config().debug_options();
-
-  bool hit_limit = false;
-  NodeFilter filter = MakeNodeFromToFilter(&from, &to, max_nodes, &hit_limit);
-  string label;
-  if (!hit_limit) {
-    label = StrCat("All paths from ", from.name(), " to ", to.name());
-  } else {
-    label = StrCat(max_nodes, " nodes on the shortest paths from ", from.name(),
-                   " to ", to.name(),
-                   "<br/><br/>***SHOWING ONLY A SUBSET OF ALL PATHS BETWEEN "
-                   "NODES***<br/><br/>");
-  }
-  string graph =
-      HloDotDumper(from.parent(), label, debug_options, show_backend_config,
-                   /*profile=*/nullptr, filter)
-          .Dump();
-  return ExportGraph(graph, GraphRendererInterface::DOT_GRAPH, debug_options);
-}
-
-void DumpText(const HloModule& module, const string& label,
-              const string& directory_path, bool do_prefix) {
-  Env* env = Env::Default();
-  TF_CHECK_OK(env->RecursivelyCreateDir(directory_path));
-  string prefix = StrCat(env->NowMicros());
-  string filename =
-      do_prefix ? StrCat(prefix, "-", label, ".txt") : StrCat(label, ".txt");
-  string path = JoinPath(directory_path, filename);
-  TF_CHECK_OK(WriteStringToFile(
-      env, path,
-      module.ToString(HloPrintOptions().set_print_large_constants(true))));
-  LOG(INFO) << "dumping module '" << module.name() << "' to " << path;
-}
-
-string MaybeDumpHloModule(const HloModule& module, const string& label,
-                          const HloExecutionProfile* profile) {
-  const DebugOptions& debug_options = module.config().debug_options();
-  VLOG(2) << "MaybeDumpHloModule called on module " << module.name()
-          << " with generate_hlo_graph regex \""
-          << debug_options.xla_generate_hlo_graph() << "\"";
-  string graph_url;
-  if (!debug_options.xla_generate_hlo_graph().empty() &&
-      RE2::PartialMatch(module.name(),
-                        debug_options.xla_generate_hlo_graph())) {
-    graph_url =
-        DumpGraph(*module.entry_computation(), label, debug_options, profile);
-  }
-  if (!debug_options.xla_log_hlo_text().empty() &&
-      RE2::PartialMatch(module.name(), debug_options.xla_log_hlo_text())) {
-    LOG(INFO) << "HLO for module " << module.name();
-    LOG(INFO) << "Label: " << label;
-    XLA_LOG_LINES(2, module.ToString());
-  }
-  if (!debug_options.xla_generate_hlo_text_to().empty()) {
-    DumpText(module, label, debug_options.xla_generate_hlo_text_to());
-  }
-  return graph_url;
-}
-
-string WrapDotInHTML(const string& dot) {
+string WrapDotInHtml(absl::string_view dot) {
   static const char html_prefix[] = R"html(
 <!DOCTYPE html>
 <html>
@@ -1640,6 +1461,9 @@
     var css_data = ''
     if (results !== null) {
         css_data = results[1].replace(/\s*data:.*\s*,/,''); // Strip content-type field.
+        // CSS inside DOT is URL-escaped, so we must unescape it
+        // before we can insert it into SVG.
+        css_data = unescape(css_data);
         dot_data = data.replace(cssregex, ''); // Remove the stylesheet
     }
 
@@ -1707,37 +1531,117 @@
 </html>
 )html";
 
-  return html_prefix + dot + html_suffix;
+  return absl::StrCat(html_prefix, dot, html_suffix);
 }
 
-string RenderDotAsHTMLFile(const string& dot,
-                           const DebugOptions& debug_options) {
-  string html = WrapDotInHTML(dot);
+tensorflow::mutex url_renderer_mu(tensorflow::LINKER_INITIALIZED);
+std::function<StatusOr<string>(absl::string_view)>* url_renderer
+    GUARDED_BY(url_renderer_mu) = nullptr;
 
-  auto env = tensorflow::Env::Default();
-  std::vector<string> dirs;
-  string output_dir = debug_options.xla_hlo_graph_path();
-  if (output_dir.empty()) {
-    env->GetLocalTempDirectories(&dirs);
+// Precondition: url_renderer != nullptr.
+//
+// (We specify this as a precondition rather than checking it here and
+// returning an error because we want to fail quickly when there's no URL
+// renderer available, and this function runs only after we've done all the work
+// of producing dot for the graph.)
+StatusOr<string> WrapDotInFormat(absl::string_view dot,
+                                 RenderedGraphFormat format)
+    EXCLUSIVE_LOCKS_REQUIRED(url_renderer_mu) {
+  switch (format) {
+    case RenderedGraphFormat::kUrl:
+      CHECK(url_renderer != nullptr)
+          << "Should have checked url_renderer != null before calling.";
+      return (*url_renderer)(dot);
+    case RenderedGraphFormat::kHtml:
+      return WrapDotInHtml(dot);
+    case RenderedGraphFormat::kDot:
+      return string(dot);
+  }
+}
+
+}  // namespace
+
+void RegisterGraphToURLRenderer(
+    std::function<StatusOr<string>(absl::string_view)> renderer) {
+  tensorflow::mutex_lock lock(url_renderer_mu);
+  if (url_renderer != nullptr) {
+    LOG(WARNING) << "Multiple calls to RegisterGraphToURLRenderer.  Last call "
+                    "wins, but because order of initialization in C++ is "
+                    "nondeterministic, this may not be what you want.";
+  }
+  delete url_renderer;
+  url_renderer = new std::function<StatusOr<string>(absl::string_view)>(
+      std::move(renderer));
+}
+
+StatusOr<string> RenderGraph(const HloComputation& computation,
+                             absl::string_view label,
+                             const DebugOptions& debug_options,
+                             RenderedGraphFormat format,
+                             const HloExecutionProfile* hlo_execution_profile,
+                             bool show_backend_config) {
+  tensorflow::mutex_lock lock(url_renderer_mu);
+  if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) {
+    return Unavailable("Can't render as URL; no URL renderer was registered.");
+  }
+
+  string rendered_dot =
+      HloDotDumper(&computation, label, debug_options, show_backend_config,
+                   hlo_execution_profile, NodeFilter())
+          .Dump();
+  return WrapDotInFormat(rendered_dot, format);
+}
+
+StatusOr<string> RenderNeighborhoodAround(
+    const HloInstruction& node, int radius, RenderedGraphFormat format,
+    bool show_backend_config,
+    const absl::flat_hash_set<const HloInstruction*>& boundary) {
+  tensorflow::mutex_lock lock(url_renderer_mu);
+  if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) {
+    return FailedPrecondition(
+        "Can't render as URL; no URL renderer was registered.");
+  }
+
+  string label =
+      StrCat("Neighborhood of ", radius, " nodes around ", node.name());
+  string rendered_dot =
+      HloDotDumper(node.parent(), label,
+                   node.GetModule()->config().debug_options(),
+                   show_backend_config, /*profile=*/nullptr,
+                   MakeNodeRadiusAroundFilter(&node, radius, boundary))
+          .Dump();
+  return WrapDotInFormat(rendered_dot, format);
+}
+
+StatusOr<string> RenderAllPathsFromTo(const HloInstruction& from,
+                                      const HloInstruction& to, int64 max_nodes,
+                                      RenderedGraphFormat format,
+                                      bool show_backend_config) {
+  tensorflow::mutex_lock lock(url_renderer_mu);
+  if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) {
+    return FailedPrecondition(
+        "Can't render as URL; no URL renderer was registered.");
+  }
+
+  CHECK_EQ(from.parent(), to.parent()) << "Nodes must be in same computation!";
+  auto debug_options = from.GetModule()->config().debug_options();
+
+  bool hit_limit = false;
+  NodeFilter filter = MakeNodeFromToFilter(&from, &to, max_nodes, &hit_limit);
+  string label;
+  if (!hit_limit) {
+    label = StrCat("All paths from ", from.name(), " to ", to.name());
   } else {
-    dirs.push_back(output_dir);
+    label = StrCat(max_nodes, " nodes on the shortest paths from ", from.name(),
+                   " to ", to.name(),
+                   "<br/><br/>***SHOWING ONLY A SUBSET OF ALL PATHS BETWEEN "
+                   "NODES***<br/><br/>");
   }
-  // Try each directory, as they might be full, have inappropriate
-  // permissions or have different problems at times.
-  string output;
-  for (const string& dir : dirs) {
-    string filename = tensorflow::io::JoinPath(dir, "graph-");
-    if (env->CreateUniqueFileName(&filename, ".html")) {
-      output = filename;
-      break;
-    }
-  }
-  if (output.empty()) {
-    LOG(FATAL) << "Failed to create unique output file name.";
-  }
-  TF_CHECK_OK(tensorflow::WriteStringToFile(env, output, html));
-  return "file://" + output;
+  string rendered_dot =
+      HloDotDumper(from.parent(), label, debug_options, show_backend_config,
+                   /*profile=*/nullptr, filter)
+          .Dump();
+  return WrapDotInFormat(rendered_dot, format);
 }
 
-}  // namespace hlo_graph_dumper
 }  // namespace xla
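
For reference, the functions above replace the XLA_REGISTER_GRAPH_RENDERER registry with a single process-wide URL renderer. A sketch of how a kUrl plugin might now register itself; packing the DOT into a data: URL is just a stand-in for a real rendering service, and the static-initializer wiring is an assumption, not something this change prescribes:

#include <string>

#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
#include "tensorflow/compiler/xla/statusor.h"

// Sketch of a RenderedGraphFormat::kUrl plugin.  This "renderer" simply
// returns the DOT source as a data: URL; a real plugin would upload the DOT
// somewhere and return that URL instead.
static const bool url_renderer_registered = [] {
  xla::RegisterGraphToURLRenderer(
      [](absl::string_view dot) -> xla::StatusOr<std::string> {
        return absl::StrCat("data:text/vnd.graphviz,", dot);
      });
  return true;
}();
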
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
index 563cea4..324ac67 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
@@ -23,52 +23,47 @@
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
 
+// This file contains routines for rendering HLO computations into a
+// human-readable graphical format.
+//
+// Fundamentally all graphs are rendered using the DOT language, but they can be
+// packaged three different ways:
+//
+//  - as a raw DOT file, which can be rendered using `graphviz`.
+//
+//  - as an HTML file with an embedded DOT file, which can be viewed in a
+//    browser using a version of graphviz compiled to JavaScript
+//
+//  - as a URL hosted somewhere which somehow embeds the DOT file.
+//
+// This last option is not implemented by default, but you can add a plugin to
+// implement it via RegisterGraphToURLRenderer.
+//
+// TODO(jlebar): Rename this file to hlo_graph_renderer.
+
 namespace xla {
-namespace hlo_graph_dumper {
 
-// Converts a HLO module to a DOT (graphviz) graph. Returns the dot graph as
-// a string.
-struct DotGraphOptions {
-  absl::string_view label;
-  const DebugOptions* debug_options = nullptr;
-  const HloExecutionProfile* profile = nullptr;
-  bool show_backend_config = false;
-};
-string HloComputationToDotGraph(const HloComputation& computation,
-                                const DotGraphOptions& options);
-
-// Abstract interface for classes that render HLO graphs (e.g. DOT graph,
-// tensorflow GraphDef) to files or services.
-class GraphRendererInterface {
- public:
-  enum GraphKind {
-    DOT_GRAPH,
-  };
-
-  virtual ~GraphRendererInterface() = default;
-
-  // Renders a DOT graph, returning a description of the rendered output
-  // (e.g., a URL)
-  virtual string RenderGraph(const string& graph, GraphKind graph_kind,
-                             const DebugOptions& debug_options) = 0;
+// Different formats that a graph can be packaged as.
+enum class RenderedGraphFormat {
+  kDot,
+  kHtml,
+  kUrl,
 };
 
-// Dump the given HLO module if a dump is requested in its debug options. Based
-// on the debug options, either a graph dump, a text dump or both may be
-// generated. If a graph dump is generated, the description (e.g. an URL) is
-// returned; otherwise an empty string is returned.
-string MaybeDumpHloModule(const HloModule& module, const string& label,
-                          const HloExecutionProfile* profile = nullptr);
+// Renders an HLO module as a human-readable visual graph.
+//
+// Note that this only works well for relatively small graphs (no more than a
+// few hundred nodes).  Beyond that, the dot is usually unrenderable,
+// unreadable, or both.  To view such graphs, use a tool such as
+// interactive_graphviz, which calls RenderNeighborhoodAround to render subsets
+// of a graph.
+StatusOr<string> RenderGraph(
+    const HloComputation& computation, absl::string_view label,
+    const DebugOptions& debug_options, RenderedGraphFormat format,
+    const HloExecutionProfile* hlo_execution_profile = nullptr,
+    bool show_backend_config = false);
 
-// Dumps a graph of the computation and returns a description of the rendered
-// graph (e.g., a URL) based on the renderer. The "best" renderer in the
-// registry is used.
-string DumpGraph(const HloComputation& computation, const string& label,
-                 const DebugOptions& debug_options,
-                 const HloExecutionProfile* hlo_execution_profile = nullptr,
-                 bool show_backend_config = false);
-
-// Like DumpGraph, but renders only nodes "near" the given node in the graph.
+// Like RenderGraph, but renders only nodes "near" the given node in the graph.
 //
 // The number of nodes dumped is controlled by the radius parameter, which
 // (roughly) corresponds to the max distance a node may be from the primary node
@@ -76,55 +71,28 @@
 //
 // The optional boundary specifies a set of boundary nodes, beyond which nodes
 // will be omitted even if they are within the radius.
-string DumpNeighborhoodAround(
-    const HloInstruction& node, int radius, bool show_backend_config = false,
+StatusOr<string> RenderNeighborhoodAround(
+    const HloInstruction& node, int radius, RenderedGraphFormat format,
+    bool show_backend_config = false,
     const absl::flat_hash_set<const HloInstruction*>& boundary = {});
 
-// Dumps nodes on any of the paths from `from` to `to`.  If there are more than
-// max_nodes on all paths, restricts to the max_nodes nodes on the shortest
+// Renders nodes on any of the paths from `from` to `to`.  If there are more
+// than max_nodes on all paths, restricts to the max_nodes nodes on the shortest
 // paths.
-string DumpAllPathsFromTo(const HloInstruction& from, const HloInstruction& to,
-                          int64 max_nodes, bool show_backend_config = false);
+StatusOr<string> RenderAllPathsFromTo(const HloInstruction& from,
+                                      const HloInstruction& to, int64 max_nodes,
+                                      RenderedGraphFormat format,
+                                      bool show_backend_config = false);
 
-// Dumps the HloModule::ToString() as a file into the provided directory path
-// suffixed with the provided label.
+// Registers a function which implements RenderedGraphFormat::kUrl.
 //
-// If do_prefix is true, a timestamp will be prepended onto the label to
-// construct a filename in the directory path; otherwise, the label is used
-// as the filename directly.
-void DumpText(const HloModule& module, const string& label,
-              const string& directory_path, bool do_prefix = true);
+// The input to the function is dot, and the output should be a URL or an error.
+//
+// There can only be one active renderer, and the last call to this function
+// wins.
+void RegisterGraphToURLRenderer(
+    std::function<StatusOr<string>(absl::string_view dot)> renderer);
 
-// Renders a DOT graph as inline SVG and saves it in an HTML file in a temporary
-// directory or directory specified via --xla_hlo_graph_path. Returns the file
-// URI pointing to the file.
-string RenderDotAsHTMLFile(const string& dot,
-                           const DebugOptions& debug_options);
-
-// Graph renderers may be added using a registration mechanism, e.g.:
-// XLA_REGISTER_GRAPH_RENDERER(AGraphRendererClass, 100)
-// The renderer with the highest numeric priority value is used.
-
-#define XLA_REGISTER_GRAPH_RENDERER(factory, ...) \
-  XLA_INTERNAL_REGISTER_GRAPH_RENDERER(factory, __COUNTER__, ##__VA_ARGS__)
-
-// Internal implementation details below this point.
-
-// Class that registers a graph renderer.
-class Registrar {
- public:
-  Registrar(std::shared_ptr<GraphRendererInterface> dumper);
-};
-
-#define XLA_INTERNAL_REGISTER_GRAPH_RENDERER(factory, ctr, ...) \
-  static ::xla::hlo_graph_dumper::Registrar                     \
-      XLA_INTERNAL_REGISTER_GRAPH_RENDERER_NAME(ctr)(           \
-          std::make_shared<factory>(), ##__VA_ARGS__)
-
-// __COUNTER__ must go through another macro to be properly expanded
-#define XLA_INTERNAL_REGISTER_GRAPH_RENDERER_NAME(ctr) ___##ctr##__object_
-
-}  // namespace hlo_graph_dumper
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GRAPH_DUMPER_H_
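
On the consumer side, callers migrate from hlo_graph_dumper::DumpGraph (which always exported the graph and returned a URL) to RenderGraph with an explicit format and a StatusOr result. A hedged sketch, with the computation and debug options assumed to be supplied by the surrounding pass:

#include <string>

#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
#include "tensorflow/compiler/xla/statusor.h"

// Sketch: render a computation as standalone DOT.  kDot and kHtml always
// succeed; kUrl succeeds only if a plugin called RegisterGraphToURLRenderer,
// and otherwise comes back as an error instead of the old CHECK failure.
xla::StatusOr<std::string> RenderAsDot(const xla::HloComputation& computation,
                                       const xla::DebugOptions& debug_options) {
  return xla::RenderGraph(computation, /*label=*/"example", debug_options,
                          xla::RenderedGraphFormat::kDot);
}
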
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
index f92759c..fa1ff49 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
@@ -38,19 +38,6 @@
   return ::testing::UnitTest::GetInstance()->current_test_info()->name();
 }
 
-class DotRenderer : public hlo_graph_dumper::GraphRendererInterface {
- public:
-  string RenderGraph(const string& graph, GraphKind graph_kind,
-                     const DebugOptions& debug_options) override {
-    return graph;
-  }
-
- private:
-  string last_graph_;
-};
-
-XLA_REGISTER_GRAPH_RENDERER(DotRenderer);
-
 TEST_F(HloGraphDumperTest, NestedFusion) {
   HloComputation::Builder b("b");
 
@@ -93,8 +80,9 @@
           {fused_sums[1], fused_sums[0]}, HloInstruction::FusionKind::kLoop);
 
   // Generate the graph; all nodes should be present.
-  string graph = hlo_graph_dumper::DumpGraph(*root_computation, /*label=*/"",
-                                             DebugOptions());
+  TF_ASSERT_OK_AND_ASSIGN(
+      string graph, RenderGraph(*root_computation, /*label=*/"", DebugOptions(),
+                                RenderedGraphFormat::kDot));
   for (const HloComputation* computation :
        {root_computation,  //
         inner_fusion->fused_instructions_computation(),
@@ -116,9 +104,10 @@
     }
   }
   ASSERT_NE(inner_sum, nullptr);
-  EXPECT_THAT(
-      hlo_graph_dumper::DumpNeighborhoodAround(*inner_sum, /*radius=*/1),
-      HasSubstr(inner_sum->name()));
+  TF_ASSERT_OK_AND_ASSIGN(string neighborhood_graph,
+                          RenderNeighborhoodAround(*inner_sum, /*radius=*/1,
+                                                   RenderedGraphFormat::kDot));
+  EXPECT_THAT(neighborhood_graph, HasSubstr(inner_sum->name()));
 }
 
 TEST_F(HloGraphDumperTest, Constant) {
@@ -129,8 +118,9 @@
   HloModuleConfig config;
   HloModule m(TestName(), config);
   HloComputation* root_computation = m.AddEntryComputation(b.Build());
-  string graph = hlo_graph_dumper::DumpGraph(
-      *root_computation, /*label=*/"an_empty_graph", DebugOptions());
+  TF_ASSERT_OK_AND_ASSIGN(
+      string graph, RenderGraph(*root_computation, /*label=*/"an_empty_graph",
+                                DebugOptions(), RenderedGraphFormat::kDot));
   EXPECT_THAT(graph, HasSubstr("an_empty_graph"));
   EXPECT_THAT(graph, Not(HasSubstr("i_am_a_constant_root_instruction")));
 }
@@ -147,8 +137,9 @@
   HloModuleConfig config;
   HloModule m(TestName(), config);
   HloComputation* root_computation = m.AddEntryComputation(b.Build(gte));
-  string graph = hlo_graph_dumper::DumpGraph(
-      *root_computation, /*label=*/"tuple_constant", DebugOptions());
+  TF_ASSERT_OK_AND_ASSIGN(
+      string graph, RenderGraph(*root_computation, /*label=*/"tuple_constant",
+                                DebugOptions(), RenderedGraphFormat::kDot));
   EXPECT_THAT(graph, HasSubstr("tuple_constant"));
   EXPECT_THAT(graph, HasSubstr("constant (f32[3,2], s32[4,5])"));
 }
@@ -164,8 +155,10 @@
     })";
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
-  string graph = hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                             /*label=*/"comp", DebugOptions());
+  TF_ASSERT_OK_AND_ASSIGN(
+      string graph,
+      RenderGraph(*module->entry_computation(), /*label=*/"tuple_constant",
+                  DebugOptions(), RenderedGraphFormat::kDot));
   EXPECT_THAT(graph, HasSubstr("direction=LT"));
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_graph_html_renderer.cc b/tensorflow/compiler/xla/service/hlo_graph_html_renderer.cc
deleted file mode 100644
index 84c4cf1..0000000
--- a/tensorflow/compiler/xla/service/hlo_graph_html_renderer.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Implementation of a DOT graph renderer that uses JavaScript to render DOT to
-// SVG in a browser.
-
-#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-
-namespace xla {
-namespace hlo_graph_dumper {
-namespace {
-
-class GraphHtmlRenderer : public GraphRendererInterface {
- public:
-  string RenderGraph(const string& graph, GraphKind graph_kind,
-                     const DebugOptions& debug_options) override {
-    switch (graph_kind) {
-      case DOT_GRAPH:
-        return RenderDotAsHTMLFile(graph, debug_options);
-      default:
-        LOG(FATAL) << "Only DOT graphs can be rendered";
-    }
-  }
-};
-
-XLA_REGISTER_GRAPH_RENDERER(GraphHtmlRenderer);
-
-}  // namespace
-}  // namespace hlo_graph_dumper
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h
index 68c1883..cee46fe 100644
--- a/tensorflow/compiler/xla/service/hlo_module_config.h
+++ b/tensorflow/compiler/xla/service/hlo_module_config.h
@@ -20,6 +20,7 @@
 
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
+#include "tensorflow/compiler/xla/service/computation_placer.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -101,6 +102,20 @@
     return intra_op_parallelism_threads_;
   }
 
+  // Checks if this config has a static device assignment.
+  bool has_static_device_assignment() const {
+    return static_device_assignment_.has_value();
+  }
+
+  // Getter and setter of the compile-time known device assignment.
+  const DeviceAssignment& static_device_assignment() const {
+    CHECK(static_device_assignment_.has_value());
+    return *static_device_assignment_;
+  }
+  void set_static_device_assignment(const DeviceAssignment& device_assignment) {
+    static_device_assignment_ = device_assignment;
+  }
+
  private:
   // If you add new members, be sure to update compilation_cache_key.
 
@@ -117,6 +132,9 @@
   int64 intra_op_parallelism_threads_ = -1;
 
   DebugOptions debug_options_;
+
+  // Compile-time known device assignment.
+  absl::optional<DeviceAssignment> static_device_assignment_;
 };
 
 }  // namespace xla
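
To make the new accessors concrete, a small sketch of attaching a compile-time device assignment to a config; the 2x1 shape and the device numbers are arbitrary examples, not values taken from this change:

#include "tensorflow/compiler/xla/service/computation_placer.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"

// Sketch: give a module config a static 2-replica, 1-computation assignment.
void AttachStaticAssignment(xla::HloModuleConfig* config) {
  xla::DeviceAssignment assignment(/*replica_count=*/2,
                                   /*computation_count=*/1);
  assignment(0, 0) = 0;  // replica 0 runs on device 0
  assignment(1, 0) = 1;  // replica 1 runs on device 1
  config->set_static_device_assignment(assignment);
  // config->has_static_device_assignment() is now true, and
  // config->static_device_assignment() returns the stored copy.
}
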
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index ae8c08c..0ca04cf 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -21,6 +21,7 @@
 #include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_proto_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -99,30 +100,8 @@
 void HloPassPipeline::MaybeDumpHlo(const HloModule& module,
                                    absl::string_view after_pass_name,
                                    absl::string_view before_pass_name) {
-  const string& proto_dump_path =
-      module.config().debug_options().xla_dump_per_pass_hlo_proto_to();
-  if (!proto_dump_path.empty()) {
-    static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
-    static auto* const module_id_to_pass_number =
-        new absl::flat_hash_map<int64, int64>();
-
-    tensorflow::mutex_lock lock(mu);
-    const int64 pass_number = (*module_id_to_pass_number)[module.unique_id()]++;
-
-    const string filename = SanitizeFileName(
-        absl::StrFormat("module_%04d.%04d.%s.after_%s", module.unique_id(),
-                        pass_number, name(), after_pass_name));
-
-    TF_QCHECK_OK(protobuf_util::DumpProtoToDirectory(
-        MakeHloProto(module), proto_dump_path, filename));
-  }
-
-  const string message =
-      absl::StrCat("after ", after_pass_name, ", before ", before_pass_name);
-  hlo_graph_dumper::MaybeDumpHloModule(module, message);
-  VLOG(3) << "HLO " << message << ":";
-  VLOG(3) << module.entry_computation_layout().ToString();
-  XLA_VLOG_LINES(3, module.ToString());
+  DumpHloModuleBetweenPassesIfEnabled(name(), before_pass_name, after_pass_name,
+                                      module);
 }
 
 void HloPassPipeline::MaybeDumpHlo(const HloModuleGroup& module_group,
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 8f44e1b..8373677 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -271,13 +271,10 @@
 
 StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
     std::unique_ptr<HloModule> module, const ReplicatedExecuteOptions& options,
-    bool use_threads) {
+    DeviceAssignment* device_assignment, bool use_threads) {
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
       CreateExecutable(std::move(module), options.run_hlo_passes));
-  TF_ASSIGN_OR_RETURN(
-      DeviceAssignment device_assignment,
-      backend().computation_placer()->AssignDevices(options.num_replicas, 1));
   std::vector<std::unique_ptr<se::Stream>> streams;
   std::vector<ServiceExecutableRunOptions> service_run_options;
 
@@ -294,13 +291,13 @@
   std::vector<absl::Span<const ShapedBuffer* const>> argument_buffer_slices;
   int64 index = 0;
   for (int64 i = 0; i < options.num_replicas; ++i) {
-    int64 device = device_assignment(i, 0);
+    int64 device = (*device_assignment)(i, 0);
     TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
                         backend().stream_executor(device));
     streams.push_back(absl::make_unique<se::Stream>(executor));
     streams.back()->Init();
     service_run_options.emplace_back(GetServiceRunOptionsForDevice(
-        device, streams.back().get(), &device_assignment));
+        device, streams.back().get(), device_assignment));
 
     // Copy arguments to device.
     for (const Literal* argument : options.arguments) {
@@ -330,7 +327,7 @@
   }
   if (options.infeed != nullptr) {
     for (int64 i = 0; i < options.num_replicas; ++i) {
-      int64 device = device_assignment(i, 0);
+      int64 device = (*device_assignment)(i, 0);
       pool->Schedule([this, device, &options]() {
         se::StreamExecutor* executor =
             backend().stream_executor(device).ValueOrDie();
@@ -348,7 +345,7 @@
   }
   if (ShapeUtil::IsInitialized(options.outfeed_shape)) {
     for (int64 i = 0; i < options.num_replicas; ++i) {
-      int64 device = device_assignment(i, 0);
+      int64 device = (*device_assignment)(i, 0);
       pool->Schedule([this, device, &options]() {
         se::StreamExecutor* executor =
             backend().stream_executor(device).ValueOrDie();
@@ -416,6 +413,16 @@
   return std::move(exec_results);
 }
 
+StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
+    std::unique_ptr<HloModule> module, const ReplicatedExecuteOptions& options,
+    bool use_threads) {
+  TF_ASSIGN_OR_RETURN(
+      DeviceAssignment device_assignment,
+      backend().computation_placer()->AssignDevices(options.num_replicas, 1));
+  return ExecuteReplicated(std::move(module), options, &device_assignment,
+                           use_threads);
+}
+
 StatusOr<std::unique_ptr<Executable>> HloRunner::CreateExecutable(
     std::unique_ptr<HloModule> module, bool run_hlo_passes) {
   if (run_hlo_passes) {
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index 88a137e..0c1ae3a 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -173,6 +173,12 @@
       std::unique_ptr<HloModule> module,
       const ReplicatedExecuteOptions& options, bool use_threads = false);
 
+  // Same as above, but with a caller-specified device assignment.
+  StatusOr<std::vector<Literal>> ExecuteReplicated(
+      std::unique_ptr<HloModule> module,
+      const ReplicatedExecuteOptions& options,
+      DeviceAssignment* device_assignment, bool use_threads = false);
+
   // If backend is not created in the constructor, creates and returns the
   // default backend. If creation fails, crashes the program.
   //
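
A sketch of calling the new `ExecuteReplicated` overload with an explicit `DeviceAssignment`. The helper `RunWithExplicitAssignment` is hypothetical and not part of the patch; it rebuilds the same row-major assignment the convenience overload would have built, but keeps a handle to it so it can be inspected or customized before execution.

```c++
#include <memory>
#include <utility>
#include <vector>

#include "tensorflow/compiler/xla/service/computation_placer.h"
#include "tensorflow/compiler/xla/service/hlo_runner.h"

namespace xla {

// Illustrative only: run a module replicated with a caller-provided device
// assignment instead of letting the runner compute one internally.
StatusOr<std::vector<Literal>> RunWithExplicitAssignment(
    HloRunner* runner, std::unique_ptr<HloModule> module,
    const HloRunner::ReplicatedExecuteOptions& options) {
  StatusOr<DeviceAssignment> assignment_or =
      runner->backend().computation_placer()->AssignDevices(
          options.num_replicas, /*computation_count=*/1);
  if (!assignment_or.ok()) {
    return assignment_or.status();
  }
  DeviceAssignment assignment = assignment_or.ValueOrDie();
  // The caller could tweak `assignment` here before running.
  return runner->ExecuteReplicated(std::move(module), options, &assignment,
                                   /*use_threads=*/true);
}

}  // namespace xla
```
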
diff --git a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
index c107391..6c0a192 100644
--- a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
@@ -16,7 +16,6 @@
 #include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h"
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -87,17 +86,7 @@
 
   EXPECT_EQ(3, module->computation_count());
   EXPECT_NE(x->to_apply(), y->to_apply());
-  if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "before unification",
-                                module->config().debug_options());
-  }
   EXPECT_TRUE(HloSubcomputationUnification().Run(module.get()).ValueOrDie());
-  if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "after unification",
-                                module->config().debug_options());
-  }
   EXPECT_EQ(2, module->computation_count());
   EXPECT_EQ(x->to_apply(), y->to_apply());
 }
@@ -126,17 +115,7 @@
 
   EXPECT_EQ(3, module->computation_count());
   EXPECT_NE(x->to_apply(), y->to_apply());
-  if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "before unification",
-                                module->config().debug_options());
-  }
   EXPECT_TRUE(HloSubcomputationUnification().Run(module.get()).ValueOrDie());
-  if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "after unification",
-                                module->config().debug_options());
-  }
   EXPECT_EQ(2, module->computation_count());
   EXPECT_EQ(x->to_apply(), y->to_apply());
 }
@@ -166,17 +145,7 @@
 
   EXPECT_EQ(3, module->computation_count());
   EXPECT_NE(x->to_apply(), y->to_apply());
-  if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "before unification",
-                                module->config().debug_options());
-  }
   EXPECT_FALSE(HloSubcomputationUnification().Run(module.get()).ValueOrDie());
-  if (VLOG_IS_ON(1)) {
-    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "after unification",
-                                module->config().debug_options());
-  }
   EXPECT_EQ(3, module->computation_count());
   EXPECT_NE(x->to_apply(), y->to_apply());
 }
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index bff7c96..039954a 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -37,7 +37,6 @@
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
-#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -1917,12 +1916,6 @@
 
 StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
   VLOG(2) << "Running layout assignment on module " << module->name();
-  XLA_VLOG_LINES(3, module->ToString());
-  if (VLOG_IS_ON(10)) {
-    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "before layout assignment",
-                                module->config().debug_options());
-  }
   TF_RETURN_IF_ERROR(Init());
 
   // Verify computation layout is sane.
@@ -1977,13 +1970,6 @@
                                                  entry_computation_layout_));
   TF_RETURN_IF_ERROR(CheckLayouts(module));
 
-  VLOG(3) << "After layout assignment:";
-  XLA_VLOG_LINES(3, module->ToString());
-  if (VLOG_IS_ON(10)) {
-    hlo_graph_dumper::DumpGraph(*module->entry_computation(),
-                                "after layout assignment",
-                                module->config().debug_options());
-  }
   // All layouts are reset then reassigned by this pass.
   return true;
 }
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index be6371a..2b81dc8 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -67,6 +67,7 @@
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:dump",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service:name_uniquer",
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index 12e5528..cc21a9f 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -31,6 +31,7 @@
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -567,14 +568,6 @@
   return result;
 }
 
-static string GetProcessUniqueIrFileName(absl::string_view prefix) {
-  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
-  static NameUniquer* uniquer = new NameUniquer(/*separator=*/"-");
-
-  tensorflow::mutex_lock lock(mu);
-  return uniquer->GetUniqueName(prefix);
-}
-
 static Status CreateAndWriteStringToFile(const string& directory_name,
                                          const string& file_name,
                                          const string& text) {
@@ -588,29 +581,27 @@
   return Status::OK();
 }
 
-Status DumpIRToDirectory(const string& directory_name,
-                         const string& hlo_module_name,
-                         const llvm::Module& llvm_module, bool optimized) {
+void DumpIrIfEnabled(const HloModule& hlo_module,
+                     const llvm::Module& llvm_module, bool optimized) {
+  const auto& debug_opts = hlo_module.config().debug_options();
+  if (!DumpingEnabledForHloModule(hlo_module)) {
+    return;
+  }
   // We can end up compiling different modules with the same name when using
   // XlaJitCompiledCpuFunction::Compile.  Avoid overwriting IR files previously
   // dumped from the same process in such cases.
-  string unique_and_safe_file_name = GetProcessUniqueIrFileName(
-      absl::StrCat("ir-", SanitizeFileName(hlo_module_name), "-",
-                   optimized ? "with" : "no", "-opt"));
-
-  string ir_file_name = tensorflow::io::JoinPath(
-      directory_name, absl::StrCat(unique_and_safe_file_name, ".ll"));
+  string suffix = absl::StrCat("ir-", optimized ? "with" : "no", "-opt");
+  DumpToFileInDirOrStdout(hlo_module, absl::StrCat(suffix, ".ll"),
+                          DumpModuleToString(llvm_module));
 
   // For some models the embedded constants can be huge, so also dump the module
-  // with the constants stripped to get IR that is easier to manipulate.
-  string ir_no_constant_initializers_file_name = tensorflow::io::JoinPath(
-      directory_name, absl::StrCat(unique_and_safe_file_name, "-noconst.ll"));
-
-  TF_RETURN_IF_ERROR(CreateAndWriteStringToFile(
-      directory_name, ir_file_name, DumpModuleToString(llvm_module)));
-  return CreateAndWriteStringToFile(
-      directory_name, ir_no_constant_initializers_file_name,
-      DumpModuleToString(*DropConstantInitializers(llvm_module)));
+  // with the constants stripped to get IR that is easier to manipulate.  Skip
+  // this if we're dumping to stdout; there's no point in duplicating everything
+  // when writing to the terminal.
+  if (!DumpingToStdout(debug_opts)) {
+    DumpToFileInDir(hlo_module, absl::StrCat(suffix, "-noconst.ll"),
+                    DumpModuleToString(*DropConstantInitializers(llvm_module)));
+  }
 }
 
 llvm::Function* CreateFunction(llvm::FunctionType* function_type,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
index 4f3e861..a960f3b 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
@@ -278,14 +278,14 @@
     llvm::LLVMContext* context, const std::map<int, llvm::MDNode*>& a,
     const std::map<int, llvm::MDNode*>& b);
 
-// Dumps out `llvm_module` to a file in the directory named `directory_name`,
-// creating the directory if necessary.  A sanitized version of
-// `hlo_module_name` is incorporated into the file name.  If `optimized` is true
-// then a suffix of "-with-opt.ll" is used, else a suffix of "-no-opt.ll" is
-// used.
-Status DumpIRToDirectory(const string& directory_name,
-                         const string& hlo_module_name,
-                         const llvm::Module& llvm_module, bool optimized);
+// Dumps out `llvm_module` to the path specified in DebugOptions, if dumping is
+// enabled for the given HLO module.
+//
+// A sanitized version of the HLO module's name is incorporated into the file
+// name.  If `optimized` is true then a suffix of "-with-opt.ll" is used, else
+// a suffix of "-no-opt.ll" is used.
+void DumpIrIfEnabled(const HloModule& hlo_module,
+                     const llvm::Module& llvm_module, bool optimized);
 
 llvm::Function* CreateFunction(llvm::FunctionType* function_type,
                                llvm::GlobalValue::LinkageTypes linkage,
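
A sketch of how a backend might call the replacement API. The wrapper function name below is hypothetical; `DumpIrIfEnabled` and its arguments are taken from the declaration above, and whether anything gets written is now decided by the HLO module's DebugOptions rather than by an explicit directory argument.

```c++
#include "llvm/IR/Module.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"

namespace xla {

// Illustrative only: dump the IR before and after the LLVM optimization
// pipeline.  With the old API the caller had to thread a directory and module
// name through; now the HloModule's own DebugOptions control dumping.
void DumpIrAroundOptimization(const HloModule& hlo_module,
                              const llvm::Module& unoptimized_ir,
                              const llvm::Module& optimized_ir) {
  llvm_ir::DumpIrIfEnabled(hlo_module, unoptimized_ir, /*optimized=*/false);
  // ... the LLVM pass pipeline would run between these two calls ...
  llvm_ir::DumpIrIfEnabled(hlo_module, optimized_ir, /*optimized=*/true);
}

}  // namespace xla
```
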
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 9bda6fb..91efbdd 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -29,6 +29,7 @@
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -63,6 +64,10 @@
 using absl::StrCat;
 using absl::StrFormat;
 
+// Argument used when calling DumpHloModuleIfEnabled before optimizations are
+// performed on an HloModule.
+constexpr char kBeforeOptimizationsDumpName[] = "before_optimizations";
+
 // Records the arguments used to invoke a computation in an HloSnapshot proto.
 Status RecordArguments(const absl::Span<const ShapedBuffer* const> arguments,
                        se::Stream* stream, TransferManager* transfer_manager,
@@ -338,21 +343,8 @@
   // Dump computation proto state if flag is set.
   std::vector<std::unique_ptr<HloSnapshot>> hlo_snapshots;
   for (int64 i = 0; i < module_protos.size(); ++i) {
-    const string& directory_path =
-        module_configs[i]->debug_options().xla_dump_computations_to();
-    const string& execution_directory_path =
-        module_configs[i]->debug_options().xla_dump_executions_to();
-    if (directory_path.empty() && execution_directory_path.empty()) {
-      continue;
-    }
     auto hlo_snapshot = absl::make_unique<HloSnapshot>();
     *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = *module_protos[i];
-    if (!directory_path.empty()) {
-      string filename = StrFormat("computation_%d__%s", module_protos[i]->id(),
-                                  module_protos[i]->entry_computation_name());
-      TF_RETURN_IF_ERROR(
-          Executable::DumpToDirectory(directory_path, filename, *hlo_snapshot));
-    }
     hlo_snapshots.push_back(std::move(hlo_snapshot));
   }
 
@@ -368,7 +360,7 @@
     const HloModuleProto* proto = module_protos[i];
     const HloModuleConfig& config = *module_configs[i];
     TF_ASSIGN_OR_RETURN(auto module, CreateModuleFromProto(*proto, config));
-    TF_RETURN_IF_ERROR(MaybeDumpUnoptimizedHloModule(*module));
+    DumpHloModuleIfEnabled(*module, kBeforeOptimizationsDumpName);
     module_group->push_back(std::move(module));
   }
 
@@ -378,7 +370,9 @@
                                    std::move(executors), device_allocator));
 
   for (size_t i = 0; i < module_protos.size(); ++i) {
-    if (!module_configs[i]->debug_options().xla_dump_executions_to().empty()) {
+    const auto& debug_opts = module_configs[i]->debug_options();
+    if (DumpingEnabledForHloModule(module_protos[i]->name(), debug_opts) &&
+        debug_opts.xla_dump_hlo_snapshots()) {
       executables[i]->set_hlo_snapshot(std::move(hlo_snapshots[i]));
     }
   }
@@ -476,24 +470,6 @@
     }
   }
 
-  // For every stream that had profiling enabled, obtain and debug-dump the HLO
-  // profile.
-  for (auto& index_to_profiled_stream : index_to_profiled_streams) {
-    int64 device = index_to_profiled_stream.first;
-    se::Stream* stream = index_to_profiled_stream.second;
-    Executable* executable = executables[device];
-    const HloModule& module = executable->module();
-    HloExecutionProfile hlo_profile(&executable->hlo_profile_printer_data(),
-                                    &executable->hlo_profile_index_map());
-    TF_RETURN_IF_ERROR(
-        executable->PopulateExecutionProfile(&hlo_profile, stream));
-    XLA_LOG_LINES(
-        tensorflow::INFO,
-        hlo_profile.ToString(streams[0]->parent()->GetDeviceDescription()));
-    hlo_graph_dumper::MaybeDumpHloModule(module, "Service::Execute",
-                                         &hlo_profile);
-  }
-
   if (profile != nullptr) {
     CHECK(!timers.empty());
     std::vector<uint64> timer_nanoseconds;
@@ -752,16 +728,17 @@
   }
 
   for (int i = 0; i < executable_ptrs.size(); i++) {
-    if (executable_ptrs[i]->dumping_snapshot()) {
+    Executable* executable = executable_ptrs[i];
+    if (executable->dumping_snapshot()) {
       TF_ASSIGN_OR_RETURN(const ShapedBuffer* result_buffer,
                           allocation_tracker_.ResolveForReplica(outputs[i], 0));
       TF_ASSIGN_OR_RETURN(auto stream,
                           execute_backend_->BorrowStream(all_executors[i][0]));
       TF_RETURN_IF_ERROR(RecordResult(*result_buffer, stream.get(),
                                       execute_backend_->transfer_manager(),
-                                      executable_ptrs[i]->hlo_snapshot()));
-      // Dump out the ith snapshot.
-      TF_RETURN_IF_ERROR(executable_ptrs[i]->DumpHloSnapshot());
+                                      executable->hlo_snapshot()));
+      DumpHloSnapshotIfEnabled(executable->module(),
+                               *executable->hlo_snapshot());
     }
   }
 
@@ -801,26 +778,9 @@
       "BuildExecutable on service %p with serialized module proto: %s", this,
       module_proto.name());
 
-  // Dump computation proto state if flag is set.
-  auto hlo_snapshot = absl::make_unique<HloSnapshot>();
-  const string& directory_path =
-      module_config->debug_options().xla_dump_computations_to();
-  const string& execution_directory_path =
-      module_config->debug_options().xla_dump_executions_to();
-  if (!directory_path.empty() || !execution_directory_path.empty()) {
-    *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = module_proto;
-    if (!directory_path.empty()) {
-      string filename = StrFormat("computation_%d__%s", module_proto.id(),
-                                  module_proto.entry_computation_name());
-      TF_RETURN_IF_ERROR(
-          Executable::DumpToDirectory(directory_path, filename, *hlo_snapshot));
-    }
-  }
-
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
                       CreateModuleFromProto(module_proto, *module_config));
-
-  TF_RETURN_IF_ERROR(MaybeDumpUnoptimizedHloModule(*module));
+  DumpHloModuleIfEnabled(*module, kBeforeOptimizationsDumpName);
 
   TF_ASSIGN_OR_RETURN(
       module, backend->compiler()->RunHloPasses(std::move(module), executor,
@@ -830,7 +790,11 @@
                       backend->compiler()->RunBackend(
                           std::move(module), executor, device_allocator));
 
-  if (!execution_directory_path.empty()) {
+  const auto& debug_opts = module_config->debug_options();
+  if (DumpingEnabledForHloModule(module_proto.name(), debug_opts) &&
+      debug_opts.xla_dump_hlo_snapshots()) {
+    auto hlo_snapshot = absl::make_unique<HloSnapshot>();
+    *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = module_proto;
     executable->set_hlo_snapshot(std::move(hlo_snapshot));
   }
 
@@ -940,7 +904,7 @@
     TF_RETURN_IF_ERROR(RecordResult(*result_buffer, stream.get(),
                                     execute_backend_->transfer_manager(),
                                     executable->hlo_snapshot()));
-    TF_RETURN_IF_ERROR(executable->DumpHloSnapshot());
+    DumpHloSnapshotIfEnabled(executable->module(), *executable->hlo_snapshot());
   }
 
   VLOG(1) << "successfully completed 'execute' request";
@@ -1162,9 +1126,7 @@
   config.set_debug_options(arg->debug_options());
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
                       CreateModuleFromProto(arg->computation(), config));
-
-  hlo_graph_dumper::MaybeDumpHloModule(*module,
-                                       "computation statistics subject");
+  DumpHloModuleIfEnabled(*module, kBeforeOptimizationsDumpName);
 
   // Run HLO analysis to get the computation statistics.
   HloCostAnalysis analysis(
@@ -1203,16 +1165,4 @@
   return replicas;
 }
 
-Status Service::MaybeDumpUnoptimizedHloModule(const HloModule& module) const {
-  const string xla_dump_unoptimized_hlo_proto_to =
-      module.config().debug_options().xla_dump_unoptimized_hlo_proto_to();
-  if (xla_dump_unoptimized_hlo_proto_to.empty()) {
-    return Status::OK();
-  }
-  HloProto proto = MakeHloProto(module);
-  return protobuf_util::DumpProtoToDirectory(
-      proto, xla_dump_unoptimized_hlo_proto_to,
-      StrCat(module.name(), ".unoptimized"));
-}
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index fd907d0..f127e34 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -275,10 +275,6 @@
   StatusOr<std::vector<se::StreamExecutor*>> Replicas(
       const Backend& backend, const DeviceHandle& device_handle) const;
 
-  // Dumps the (unoptimized) module given if the corresponding DebugOptions
-  // field has been set.
-  Status MaybeDumpUnoptimizedHloModule(const HloModule& module) const;
-
   // Returns the device handle that represents the replicated device for a
   // single computation that is not model-parallelized.
   DeviceHandle SingleComputationDeviceHandle() const;
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 8e4d865..e153668 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -156,14 +156,6 @@
   return Status::OK();
 }
 
-bool IsTrivialWindowDimension(const WindowDimension& window_dimension) {
-  return window_dimension.size() == 1 && window_dimension.stride() == 1 &&
-         window_dimension.padding_low() == 0 &&
-         window_dimension.padding_high() == 0 &&
-         window_dimension.window_dilation() == 1 &&
-         window_dimension.base_dilation() == 1;
-}
-
 StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
                                        const Window& window,
                                        PrimitiveType element_type,
@@ -205,7 +197,8 @@
           window.DebugString());
     }
 
-    if (base_shape.is_dynamic_dimension(i) && !IsTrivialWindowDimension(dim)) {
+    if (base_shape.is_dynamic_dimension(i) &&
+        !window_util::IsTrivialWindowDimension(dim)) {
       return Unimplemented(
           "Dynamic shape is not supported for non trivial window: %s",
           window_util::ToString(window));
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 62e2b46..a12fa04 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -217,6 +217,20 @@
                                         use_threads);
 }
 
+StatusOr<std::vector<Literal>> HloTestBase::ExecuteReplicated(
+    std::unique_ptr<HloModule> module, absl::Span<Literal* const> arguments,
+    int64 num_replicas, DeviceAssignment* device_assignment,
+    bool run_hlo_passes, bool use_threads) {
+  HloRunner::ReplicatedExecuteOptions options;
+  options.num_replicas = num_replicas;
+  options.run_hlo_passes = run_hlo_passes;
+  for (auto argument : arguments) {
+    options.arguments.push_back(argument);
+  }
+  return test_runner_.ExecuteReplicated(std::move(module), options,
+                                        device_assignment, use_threads);
+}
+
 StatusOr<std::unique_ptr<HloModule>> HloTestBase::MakeReferenceModule(
     const HloModule& test_module,
     const std::function<void(HloModule*)>& reference_preprocessor) {
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index df9c29a..6c6fe34 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -182,6 +182,12 @@
       std::unique_ptr<HloModule> module, absl::Span<Literal* const> arguments,
       int64 num_replicas, bool use_threads);
 
+  // Same as above, but uses the specified device assignment.
+  StatusOr<std::vector<Literal>> ExecuteReplicated(
+      std::unique_ptr<HloModule> module, absl::Span<Literal* const> arguments,
+      int64 num_replicas, DeviceAssignment* device_assignment,
+      bool run_hlo_passes, bool use_threads);
+
   // Executes the given hlo module on two backends and compares results.
   //
   // 'arguments': the input of the hlo module.
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 841242b..352b59f 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -874,6 +874,55 @@
                            /*pad_high=*/{0, 0, 2, 0},
                            /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kMax},
+
+    // Patterns generated by cumsum/cumprod.
+    R4ReduceWindowTestData{/*base_bounds=*/{1021, 1, 16, 16},
+                           /*window_bounds=*/{1021, 1, 1, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{1020, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
+                           /*reducer=*/kAdd},
+
+    R4ReduceWindowTestData{/*base_bounds=*/{1021, 1, 16, 16},
+                           /*window_bounds=*/{1, 1, 1021, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 0, 1020, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
+                           /*reducer=*/kAdd},
+
+    R4ReduceWindowTestData{/*base_bounds=*/{16, 1, 16, 1021},
+                           /*window_bounds=*/{1, 1, 1, 1021},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 0, 0, 1020},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
+                           /*reducer=*/kAdd},
+
+    R4ReduceWindowTestData{/*base_bounds=*/{1021, 1, 16, 16},
+                           /*window_bounds=*/{1021, 1, 1, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{1021, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
+                           /*reducer=*/kAdd},
+
+    R4ReduceWindowTestData{/*base_bounds=*/{16, 1, 1021, 16},
+                           /*window_bounds=*/{1, 1, 1021, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 0, 1021, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
+                           /*reducer=*/kAdd},
+
+    R4ReduceWindowTestData{/*base_bounds=*/{16, 1, 16, 1021},
+                           /*window_bounds=*/{1, 1, 1, 1021},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 0, 0, 1021},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
+                           /*reducer=*/kAdd},
 };
 
 INSTANTIATE_TEST_CASE_P(
@@ -1266,9 +1315,9 @@
      /*reducer=*/Reducer::kMax},
 
     // The pattern generated by exclusive scan (cumsum/cumprod).
-    {/*base_bounds=*/{4096}, /*window_bounds=*/{4096},
+    {/*base_bounds=*/{4095}, /*window_bounds=*/{4095},
      /*strides=*/{1},
-     /*pad_low=*/{4096},
+     /*pad_low=*/{4095},
      /*pad_high=*/{0},
      /*reducer=*/Reducer::kMax},
 };
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index ebd4bb1..e9244ec 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -231,6 +231,7 @@
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
     ],
diff --git a/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc b/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc
index 8460ae3..88f3a8b 100644
--- a/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc
+++ b/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc
@@ -19,7 +19,9 @@
 //
 // Reads one serialized HLO module, converts it into JSON format and dumps it
 // into some output directory. some_binary_proto is obtained by serializing the
-// module to disk using --xla_dump_optimized_hlo_proto_to debug option.
+// module to disk using the debug options
+//
+//   --xla_dump_to=DIR --xla_dump_hlo_as_proto
 
 #include <stdio.h>
 #include <string>
diff --git a/tensorflow/compiler/xla/tools/interactive_graphviz.cc b/tensorflow/compiler/xla/tools/interactive_graphviz.cc
index 0c7c078..5652d30 100644
--- a/tensorflow/compiler/xla/tools/interactive_graphviz.cc
+++ b/tensorflow/compiler/xla/tools/interactive_graphviz.cc
@@ -38,6 +38,8 @@
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/tools/hlo_extractor.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/subprocess.h"
@@ -388,22 +390,18 @@
   return false;
 }
 
-void DisplayGraphHandle(const Options &opts, const string& handle) {
-  std::cout << handle << std::endl;
+void OpenUrl(const Options& opts, absl::string_view url) {
+  std::cout << url << std::endl;
 
   // If it is a url, try to open it up in the user's browser too.
-  if (absl::StartsWithIgnoreCase(handle, "http://") ||
-      absl::StartsWithIgnoreCase(handle, "https://") ||
-      absl::StartsWithIgnoreCase(handle, "file://")) {
+  if (absl::StartsWithIgnoreCase(url, "http://") ||
+      absl::StartsWithIgnoreCase(url, "https://") ||
+      absl::StartsWithIgnoreCase(url, "file://")) {
     const char* browser_bin = opts.browser.empty() ? "/usr/bin/sensible-browser"
                                                    : opts.browser.c_str();
     tensorflow::SubProcess p;
-    p.SetProgram(browser_bin, {browser_bin, handle});
+    p.SetProgram(browser_bin, {browser_bin, string(url)});
     p.Start();
-  } else if (handle.empty()) {
-    std::cerr << "Unable to render graph, perhaps due to graphviz server "
-                 "timeout.  Run with --logtostderr to see."
-              << std::endl;
   } else {
     std::cerr << "\nExpected a URL, but got strange graph result (dumped "
                  "above).  If this isn't what you expected, maybe file a bug?"
@@ -411,6 +409,65 @@
   }
 }
 
+// Renders a graph by calling `renderer`, and then tries to open it.
+//
+// `renderer` is a callback so we can try various formats.  In particular, the
+// URL format doesn't work out of the box; it requires you to register a plugin.
+void RenderAndDisplayGraph(
+    const Options& opts,
+    const std::function<StatusOr<string>(RenderedGraphFormat)>& renderer) {
+  StatusOr<string> url_result = renderer(RenderedGraphFormat::kUrl);
+  if (url_result.ok()) {
+    string url = url_result.ValueOrDie();
+    OpenUrl(opts, url);
+    return;
+  }
+
+  // Ignore UNAVAILABLE errors; these are expected when there's no URL renderer
+  // plugin registered.
+  if (url_result.status().code() != tensorflow::error::UNAVAILABLE) {
+    std::cerr << "Unable to render graph as URL: " << url_result.status()
+              << std::endl;
+    std::cerr << "Trying as HTML..." << std::endl;
+  }
+
+  auto* env = tensorflow::Env::Default();
+  StatusOr<string> html_result = renderer(RenderedGraphFormat::kHtml);
+  if (!html_result.ok()) {
+    std::cerr << "Failed to render graph as HTML: " << html_result.status()
+              << std::endl;
+    return;
+  }
+
+  std::vector<string> temp_dirs;
+  env->GetLocalTempDirectories(&temp_dirs);
+  if (temp_dirs.empty()) {
+    std::cerr << "Can't render graph as HTML because we can't find a suitable "
+                 "temp directory.  Try setting $TMPDIR?"
+              << std::endl;
+    return;
+  }
+
+  // Try to create a unique file inside of temp_dirs.front().  Notably, this
+  // file's name must end with ".html", otherwise web browsers will treat it as
+  // plain text, so we can't use Env::CreateUniqueFileName().
+  string temp_file_path = tensorflow::io::JoinPath(
+      temp_dirs.front(),
+      absl::StrFormat("interactive_graphviz.%d.html", env->NowMicros()));
+  auto status = tensorflow::WriteStringToFile(
+      env, temp_file_path, std::move(html_result).ValueOrDie());
+  if (status.ok()) {
+    OpenUrl(opts, absl::StrCat("file://", temp_file_path));
+    return;
+  }
+
+  std::cerr << "Failed to write rendered HTML graph to " << temp_file_path
+            << ": " << status;
+
+  // We don't bother trying kDot, because kHtml should always work (or if it
+  // doesn't, we don't have any reason to believe kDot will work better).
+}
+
 void DoAllPathsCommand(const Options& opts, const HloModule& module,
                        const std::vector<string>& tokens) {
   if (tokens.size() > 4) {
@@ -451,8 +508,10 @@
     std::cerr << "No path from/to " << tokens[1] << " to/from " << tokens[2];
     return;
   }
-  DisplayGraphHandle(opts, hlo_graph_dumper::DumpAllPathsFromTo(
-      *from, *to, max_nodes, /*show_backend_config=*/show_backend_config));
+  RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) {
+    return RenderAllPathsFromTo(*from, *to, max_nodes, format,
+                                /*show_backend_config=*/show_backend_config);
+  });
 }
 
 // Plot a given instruction neighborhood or computation with graphviz.
@@ -513,14 +572,19 @@
   // Generate the graph and print the resulting string, which should be a
   // graphviz url.
   if (comp) {
-    DisplayGraphHandle(opts, hlo_graph_dumper::DumpGraph(
-        *comp, "", comp->parent()->config().debug_options(), nullptr,
-        /*show_backend_config=*/show_backend_config));
+    RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) {
+      return RenderGraph(*comp, /*label=*/"",
+                         comp->parent()->config().debug_options(), format,
+                         /*hlo_execution_profile=*/nullptr,
+                         /*show_backend_config=*/show_backend_config);
+    });
   } else {
-    DisplayGraphHandle(opts, hlo_graph_dumper::DumpNeighborhoodAround(
-                                 *instr, graph_width,
-                                 /*show_backend_config=*/show_backend_config,
-                                 /*boundary=*/boundary));
+    RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) {
+      return RenderNeighborhoodAround(
+          *instr, graph_width, format,
+          /*show_backend_config=*/show_backend_config,
+          /*boundary=*/boundary);
+    });
   }
 }
 
diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index e001cc3..f2e1831 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -204,6 +204,14 @@
          window_dim.padding_low() == 0 && window_dim.padding_high() == 0;
 }
 
+bool IsTrivialWindowDimension(const WindowDimension& window_dimension) {
+  return window_dimension.size() == 1 && window_dimension.stride() == 1 &&
+         window_dimension.padding_low() == 0 &&
+         window_dimension.padding_high() == 0 &&
+         window_dimension.window_dilation() == 1 &&
+         window_dimension.base_dilation() == 1;
+}
+
 int64 DilatedBound(int64 bound, int64 dilation) {
   CHECK_GE(bound, 0);
   CHECK_GE(dilation, 1);
diff --git a/tensorflow/compiler/xla/window_util.h b/tensorflow/compiler/xla/window_util.h
index 099d7ec..e709928 100644
--- a/tensorflow/compiler/xla/window_util.h
+++ b/tensorflow/compiler/xla/window_util.h
@@ -62,6 +62,10 @@
 // has window bound 1, no striding and no padding.
 bool IsInactiveWindowDimension(const Window& window, int64 logical_dim);
 
+// Returns true if the provided window dimension is trivial (inactive and has
+// no dilation).
+bool IsTrivialWindowDimension(const WindowDimension& window_dimension);
+
 // Returns the new bound after dilation.
 //
 // If a window with the given bound in some dimension is dilated with the given
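
A sketch of using the relocated helper; `WindowIsTrivial` is a hypothetical caller written against the declaration above, not a function added by this patch.

```c++
#include "tensorflow/compiler/xla/window_util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"

namespace xla {

// Illustrative only: a window is "trivial" iff every dimension has size 1,
// stride 1, no padding, and no dilation.  Shape inference (above) rejects any
// non-trivial window over a dynamic dimension.
bool WindowIsTrivial(const Window& window) {
  for (const WindowDimension& dim : window.dimensions()) {
    if (!window_util::IsTrivialWindowDimension(dim)) {
      return false;
    }
  }
  return true;
}

}  // namespace xla
```
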
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 925fcbf..8799296 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -61,40 +61,12 @@
 // Debugging options for XLA. These options may change at any time - there are
 // no guarantees about backward or forward compatibility for these fields.
 message DebugOptions {
-  // HLO modules matching this regex will be dumped to a .dot file throughout
-  // various stages in compilation (file names are LOG(INFO)'d). Set to ".*" to
-  // dump *all* HLO modules.
-  string xla_generate_hlo_graph = 1;
-
   // Show addresses of HLO ops in graph dump.
   bool xla_hlo_graph_addresses = 2;
 
-  // Path to dump HLO graphs to.
-  string xla_hlo_graph_path = 4;
-
-  reserved 5;  // Was xla_hlo_dump_as_graphdef
-
-  // HLO modules matching this regex will be dumped to LOG(INFO). Set to ".*" to
-  // dump *all* HLO modules.
-  string xla_log_hlo_text = 6;
-
-  // Dump all HLO modules as text into the provided directory path.
-  string xla_generate_hlo_text_to = 7;
-
-  // Dump Hlo after all hlo passes are executed as proto binary into this
-  // directory.
-  string xla_dump_optimized_hlo_proto_to = 8;
-
   // Instrument the computation to collect per-HLO cycle counts.
   bool xla_hlo_profile = 9;
 
-  // Dumps computations that XLA executes into the provided directory path.
-  string xla_dump_computations_to = 10;
-
-  // Dumps parameters and results of computations that XLA executes into the
-  // provided directory path.
-  string xla_dump_executions_to = 11;
-
   // List of HLO passes to disable. These names must exactly match the pass
   // names as specified by the HloPassInterface::name() method.
   repeated string xla_disable_hlo_passes = 30;
@@ -114,9 +86,6 @@
   // Embed the compiler IR as a string in the executable.
   bool xla_embed_ir_in_executable = 33;
 
-  // Dump the compiler IR into this directory as individual files.
-  string xla_dump_ir_to = 34;
-
   // Eliminate implicit broadcasts when lowering user computations to HLO
   // instructions; use explicit broadcast instead.
   bool xla_eliminate_hlo_implicit_broadcast = 35;
@@ -176,14 +145,6 @@
   // ops.
   bool xla_gpu_use_cudnn_batchnorm = 94;
 
-  // Dump HLO before any hlo passes are executed as proto binary into this
-  // directory.
-  string xla_dump_unoptimized_hlo_proto_to = 95;
-
-  // Dump HLO after each pass as an HloProto in binary file format into this
-  // directory.
-  string xla_dump_per_pass_hlo_proto_to = 96;
-
   // Generate calls to MKL-DNN in the CPU backend.
   bool xla_cpu_use_mkl_dnn = 97;
 
@@ -221,9 +182,6 @@
   // If set to true XLA:GPU invokes `ptxas` with -O0 (default is -O3).
   bool xla_gpu_disable_ptxas_optimizations = 103;
 
-  // Dump HLO graphs as an HTML (DOT -> SVG inlined in HTML)
-  bool xla_hlo_dump_as_html = 105;
-
   // Enable fast math with eigen in the HLO evaluator.
   bool xla_hlo_evaluator_use_fast_path = 106;
 
@@ -247,11 +205,59 @@
   // value.
   StepMarkerLocation xla_step_marker_location = 108;
 
-  // Next id: 109
+  //
+  // BEGIN flags controlling dumping HLO modules for debugging.
+  //
+  // When dumping is enabled, HLO modules are dumped at the very beginning and
+  // end of compilation, and optionally also during the pass pipeline.
+  //
+  // In general, if you set one of these flags, we will try to infer reasonable
+  // defaults for the others.  For example:
+  //
+  //  * Setting --xla_dump_to=/tmp/foo without specifying a format
+  //    with --xla_dump_hlo_as_* will turn on --xla_dump_hlo_as_text.
+  //
+  //  * Setting --xla_dump_hlo_as_text without specifying --xla_dump_to will
+  //    dump to stdout.
+  //
+
+  // Directory to dump into.
+  string xla_dump_to = 109;
+
+  // If specified, will only dump modules which match this regexp.
+  string xla_dump_hlo_module_re = 110;
+
+  // If this flag is specified, will also dump HLO before and after passes that
+  // match this regular expression.  Set to .* to dump before/after all passes.
+  string xla_dump_hlo_pass_re = 111;
+
+  // Specifies the format that HLO is dumped in.  Multiple of these may be
+  // specified.
+  bool xla_dump_hlo_as_text = 112;
+  bool xla_dump_hlo_as_proto = 113;
+  bool xla_dump_hlo_as_dot = 114;
+  bool xla_dump_hlo_as_url = 115;
+
+  // Dump HLO graphs as HTML (DOT -> SVG inlined in HTML).
+  bool xla_dump_hlo_as_html = 116;
+
+  // If true, every time an HLO module is run, we will dump an HloSnapshot
+  // (essentially, a serialized module plus its inputs) to the --xla_dump_to
+  // directory.
+  bool xla_dump_hlo_snapshots = 118;
+
+  //
+  // END flags controlling dumping HLO modules.
+  //
+
+  // Next id: 119
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
+
+  reserved 117;  // Was xla_dump_to
+  reserved 5;    // Was xla_hlo_dump_as_graphdef
 }
 
 // These settings control how XLA compiles and/or runs code.  Not all settings
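
For reference, a sketch of how the consolidated dump fields compose, using only the proto-generated setters (the same setters the xrt_util.cc change below relies on). The directory path is illustrative; the equivalent is normally set through the corresponding `--xla_dump_*` flags, assuming those flag registrations exist in the flag-parsing code.

```c++
#include "tensorflow/compiler/xla/xla.pb.h"

namespace xla {

// Illustrative only: dump HLO as text and as HloProto binaries for every
// module, including before/after every pass, into a single directory.
DebugOptions MakeDumpingDebugOptions() {
  DebugOptions opts;
  opts.set_xla_dump_to("/tmp/xla_dump");  // Directory all dumps go into.
  opts.set_xla_dump_hlo_as_text(true);    // Dump HLO as text...
  opts.set_xla_dump_hlo_as_proto(true);   // ...and as HloProto binaries.
  opts.set_xla_dump_hlo_pass_re(".*");    // Also dump around every pass.
  return opts;
}

}  // namespace xla
```
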
diff --git a/tensorflow/compiler/xrt/xrt_util.cc b/tensorflow/compiler/xrt/xrt_util.cc
index 3ef8bed..8b7749b 100644
--- a/tensorflow/compiler/xrt/xrt_util.cc
+++ b/tensorflow/compiler/xrt/xrt_util.cc
@@ -55,21 +55,14 @@
     return ref_options;
   }
   xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
-  options.set_xla_generate_hlo_text_to(
-      SafeDebugPath(ref_options.xla_generate_hlo_text_to()));
-  options.set_xla_dump_optimized_hlo_proto_to(
-      SafeDebugPath(ref_options.xla_dump_optimized_hlo_proto_to()));
-  options.set_xla_dump_computations_to(
-      SafeDebugPath(ref_options.xla_dump_computations_to()));
-  options.set_xla_dump_executions_to(
-      SafeDebugPath(ref_options.xla_dump_executions_to()));
+  options.set_xla_dump_to(SafeDebugPath(ref_options.xla_dump_to()));
+  options.set_xla_dump_hlo_as_proto(ref_options.xla_dump_hlo_as_proto());
+  options.set_xla_dump_hlo_as_text(ref_options.xla_dump_hlo_as_text());
+  options.set_xla_dump_hlo_snapshots(ref_options.xla_dump_hlo_snapshots());
+  options.set_xla_dump_hlo_pass_re(ref_options.xla_dump_hlo_pass_re());
   for (auto& pass : ref_options.xla_disable_hlo_passes()) {
     options.add_xla_disable_hlo_passes(pass);
   }
-  options.set_xla_dump_unoptimized_hlo_proto_to(
-      SafeDebugPath(ref_options.xla_dump_unoptimized_hlo_proto_to()));
-  options.set_xla_dump_per_pass_hlo_proto_to(
-      SafeDebugPath(ref_options.xla_dump_per_pass_hlo_proto_to()));
   return options;
 }
 
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index 60ee1b4..9e9d85d 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -294,11 +294,12 @@
     `CMakeLists.txt` and the c++ file `main.cxx`
 2.  Fill in the `main.cxx` with the code provided in
     [official c++ api basic](https://www.tensorflow.org/api_guides/cc/guide).
-3.  Fill in the `CMakeLists.txt` with following code: ``` cmake
+3.  Fill in the `CMakeLists.txt` with following code:
+
+    ```cmake
     cmake_minimum_required (VERSION 2.6) project (tf_hello)
 
     # Tensorflow
-
     find_package(Tensorflow REQUIRED)
     include_directories(${TENSORFLOW_INCLUDE_DIRS})
 
@@ -314,7 +315,8 @@
     this CMakeList.txt, under development") endif()
 
     add_executable(tf_hello main.cxx) target_link_libraries(tf_hello
-    ${TENSORFLOW_LIBRARIES}) ```
+    ${TENSORFLOW_LIBRARIES})
+    ```
 
 4.  Configure the folder with cmake-gui, an error should be prompted out,
     requesting you to locate the folder containing `TensorflowConfig.cmake`.
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 123aaf7..0f683a8 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -254,6 +254,33 @@
     ],
 )
 
+cuda_py_test(
+    name = "keras_multi_worker_test",
+    srcs = ["keras_multi_worker_test.py"],
+    additional_deps = [
+        "//tensorflow/contrib/distribute/python:collective_all_reduce_strategy",
+        "//tensorflow/contrib/distribute/python:combinations",
+        "//tensorflow/contrib/distribute/python:mirrored_strategy",
+        "//tensorflow/contrib/distribute/python:multi_worker_test_base",
+        "//tensorflow/contrib/distribute/python:parameter_server_strategy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute:distribute_config",
+        "//tensorflow/python/distribute:distribute_coordinator",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/keras",
+        "//tensorflow/python/keras:engine",
+    ],
+    shard_count = 3,
+    tags = [
+        # TODO(b/124344198): Add "multi_and_single_gpu",
+    ],
+)
+
 py_library(
     name = "step_fn",
     srcs = ["step_fn.py"],
diff --git a/tensorflow/contrib/distribute/python/keras_multi_worker_test.py b/tensorflow/contrib/distribute/python/keras_multi_worker_test.py
new file mode 100644
index 0000000..5a0625a
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_multi_worker_test.py
@@ -0,0 +1,460 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test multi-worker Keras.
+
+TODO(b/123845258): Move this to tensorflow core.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import copy
+import functools
+import os
+import sys
+import threading
+
+from absl.testing import parameterized
+
+# pylint: disable=g-direct-tensorflow-import
+from tensorflow.contrib.distribute.python import collective_all_reduce_strategy as collective_strategy
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import multi_worker_test_base as test_base
+from tensorflow.contrib.distribute.python import parameter_server_strategy
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import distribute_coordinator as dc
+from tensorflow.python.distribute import distribute_coordinator_context as dc_context
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend
+from tensorflow.python.keras import callbacks
+from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import models
+from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import nest
+
+
+def _mnist_synthetic_dataset(batch_size, steps_per_epoch):
+  # train dataset
+  x_train = array_ops.ones([batch_size * steps_per_epoch, 28, 28, 1],
+                           dtype=dtypes.float32)
+  y_train = array_ops.ones([batch_size * steps_per_epoch, 1],
+                           dtype=dtypes.int32)
+  train_ds = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
+  train_ds = train_ds.repeat()
+  # train_ds = train_ds.shuffle(100)
+  train_ds = train_ds.batch(64, drop_remainder=True)
+
+  # eval dataset
+  x_test = random_ops.random_uniform([10000, 28, 28, 1], dtype=dtypes.float32)
+  y_test = random_ops.random_uniform([10000, 1],
+                                     minval=0,
+                                     maxval=9,
+                                     dtype=dtypes.int32)
+  eval_ds = dataset_ops.Dataset.from_tensor_slices((x_test, y_test))
+  eval_ds = eval_ds.repeat()
+  eval_ds = eval_ds.batch(64, drop_remainder=True)
+
+  return train_ds, eval_ds
+
+
+def _get_model(input_shape):
+  # Define a deterministically-initialized CNN model to recognize MNIST digits,
+  # with several layers commented out to simplify it.
+  model = keras.models.Sequential()
+  model.add(
+      keras.layers.Conv2D(
+          32,
+          kernel_size=(3, 3),
+          activation='relu',
+          input_shape=input_shape,
+          kernel_initializer=keras.initializers.TruncatedNormal(seed=99)))
+  # model.add(keras.layers.Conv2D(64, (3, 3), activation='relu'))
+  # model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))
+  # model.add(keras.layers.Dropout(0.25))
+  model.add(keras.layers.Flatten())
+  # model.add(keras.layers.Dense(128, activation='relu'))
+  # model.add(keras.layers.Dropout(0.5))
+  model.add(
+      keras.layers.Dense(
+          10,
+          activation='softmax',
+          kernel_initializer=keras.initializers.TruncatedNormal(seed=99)))
+
+  # TODO(yuefengz): optimizer with slot variables doesn't work because of
+  # optimizer's bug.
+  # TODO(yuefengz): we should not allow non-v2 optimizer.
+  model.compile(
+      loss=keras.losses.sparse_categorical_crossentropy,
+      optimizer=gradient_descent.SGD(learning_rate=0.001),
+      metrics=['accuracy'])
+  return model
+
+
+def _clone_and_build_model(model, strategy):
+  # The new "original" model in worker 0.
+  with strategy.scope():
+    cloned_model = models.clone_model(model)
+
+  # Compile and build model.
+  if isinstance(model.optimizer, optimizers.TFOptimizer):
+    optimizer = model.optimizer
+    # TODO(yuefengz): figure out why the optimizer here is still a
+    # TFOptimizer.
+    while isinstance(optimizer, optimizers.TFOptimizer):
+      optimizer = optimizer.optimizer
+    optimizer = copy.deepcopy(optimizer)
+  else:
+    optimizer_config = model.optimizer.get_config()
+    optimizer = type(model.optimizer).from_config(optimizer_config)
+
+  cloned_model.compile(
+      optimizer,
+      model.loss,
+      metrics=metrics_module.clone_metrics(model._compile_metrics),
+      loss_weights=model.loss_weights,
+      sample_weight_mode=model.sample_weight_mode,
+      weighted_metrics=metrics_module.clone_metrics(
+          model._compile_weighted_metrics))
+  return cloned_model
+
+
+# TODO(b/123918215): Possibly merge this Callback with keras_test.Counter.
+class MultiWorkerVerificationCallback(callbacks.Callback):
+  """MultiWorkerVerificationCallback verifies the callbacks in multi-worker scheme.
+
+  This Callback is intended to be used for verifying the callback is indeed
+  called the correct number of times in various task types.
+
+  Attributes:
+    _task_dict: A nested dictionary storing the number of times a callback has
+                been called for a specific task type, task index, and method
+                name.  The lookup structure is
+                task_name -> task_id -> tracking_method_name -> invoke_count
+                For example, a _task_dict of
+                {
+                    'ps': {
+                         0: {
+                             'on_epoch_begin': 2
+                         },
+                         1: {
+                             'on_epoch_begin': 2
+                         }
+                    },
+                    'worker': {
+                         0: {
+                             'on_epoch_begin': 2
+                         },
+                         1: {
+                             'on_epoch_begin': 2
+                         }
+                    }
+                }
+                indicates the ps task has 'on_epoch_begin' called twice on each
+                of the two indices, and likewise for worker task.
+  """
+
+  # TODO(rchao): Add other method calls to verify.
+  METHODS_TO_VERIFY = ['on_epoch_begin']
+
+  def __init__(self, num_epoch, num_worker):
+    """Initialize a MultiWorkerVerificationCallback.
+
+    Args:
+      num_epoch: Number of epochs this Callback is expected to be called for.
+      num_worker: Number of workers this Callback is expected to be called from.
+    """
+    super(MultiWorkerVerificationCallback, self).__init__()
+    self._num_epoch = num_epoch
+    self._num_worker = num_worker
+    self._task_dict = {
+        key: collections.defaultdict(lambda: collections.defaultdict(int))
+        for key in ['ps', 'worker']
+    }
+    self._lock = threading.Lock()
+    self._is_between_graph = None
+    self.wrap_methods(self.METHODS_TO_VERIFY)
+
+  @property
+  def is_between_graph(self):
+    return self._is_between_graph
+
+  @is_between_graph.setter
+  def is_between_graph(self, is_between_graph):
+    self._is_between_graph = is_between_graph
+
+  def wrap_methods(self, method_names):
+    """Wrap methods so that the counts of calls are tracked.
+
+    Args:
+      method_names: A list of names of methods to track calls.
+    """
+    for method_name in method_names:
+      method = getattr(self, method_name)
+
+      def wrapped_method(method_to_wrap, name, *arg, **kwargs):
+        # Use lock to ensure += operation is thread-safe.
+        with self._lock:
+          self._task_dict[test_base.get_task_type()][
+              test_base.get_task_index()][name] += 1
+        method_to_wrap(*arg, **kwargs)
+
+      setattr(self, method_name,
+              functools.partial(wrapped_method, method, method_name))
+
+  def verify(self, test_case):
+    method_count_dict = {
+        method_name: self._num_epoch for method_name in self.METHODS_TO_VERIFY
+    }
+    assert self._is_between_graph is not None
+    if self._is_between_graph:
+      # TODO(b/124171024): In between-graph replication, by default only the
+      # chief calls callback. Fix this test to cover that, as well as the rare
+      # cases where all workers call.
+      worker_call_count = {
+          i: method_count_dict for i in range(0, self._num_worker)
+      }
+    else:
+      # If in-graph, only the first worker calls callback methods.
+      worker_call_count = {0: method_count_dict}
+    test_case.assertDictEqual(
+        self._task_dict,
+        {
+            # The PS's callback is not supposed to be called.
+            'ps': {},
+            # Each worker's callback should be called num_epoch times.
+            'worker': worker_call_count
+        })
+
+
+# TODO(yuefengz): right now, fit or evaluate has to be called under the
+# distribution strategy's scope.
+def _run_standalone_client(test_obj, strategy, cluster_spec):
+  input_shape = (28, 28, 1)
+  with strategy.scope():
+    orig_model = _get_model(input_shape)
+
+  def worker_fn(strategy):
+    with ops.Graph().as_default():
+      batch_size = 64
+      steps = 10
+
+      with strategy.scope():
+        train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
+        model = _clone_and_build_model(orig_model, strategy)
+
+        orig_loss, orig_acc = model.evaluate(train_ds, steps=steps)
+
+        # Workaround for the metrics issue (b/122928955) in async training. This
+        # can only be used in standalone client mode.
+        dc_context.get_current_worker_context().wait_for_other_workers()
+
+        model.fit(x=train_ds, epochs=2, steps_per_epoch=steps)
+
+        dc_context.get_current_worker_context().wait_for_other_workers()
+
+        trained_loss, trained_acc = model.evaluate(train_ds, steps=steps)
+
+      test_obj.assertLessEqual(trained_loss, orig_loss)
+      test_obj.assertGreaterEqual(trained_acc, orig_acc)
+
+  dc.run_distribute_coordinator(
+      worker_fn,
+      strategy,
+      mode=dc.CoordinatorMode.STANDALONE_CLIENT,
+      cluster_spec=cluster_spec)
+
+
+def get_strategy_object(strategy_cls):
+  if (strategy_cls == mirrored_strategy.MirroredStrategy or
+      strategy_cls == mirrored_strategy.CoreMirroredStrategy):
+    return strategy_cls(mirrored_strategy.all_local_devices())
+  else:
+    return strategy_cls(num_gpus_per_worker=context.num_gpus())
+
+
+class KerasMultiWorkerTestStandaloneClient(test.TestCase,
+                                           parameterized.TestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    """Create a local cluster with 2 workers."""
+    super(KerasMultiWorkerTestStandaloneClient, cls).setUpClass()
+    cls._cluster_spec = test_base.create_in_process_cluster(
+        num_workers=2, num_ps=1, has_eval=False)
+
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph'],
+          strategy_cls=[
+              mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
+              parameter_server_strategy.ParameterServerStrategy,
+              collective_strategy.CollectiveAllReduceStrategy,
+          ],
+          required_gpus=[0, 1]))
+  def testSimpleModelStandaloneClient(self, strategy_cls):
+    # With the standalone client, training_utils.should_run_multi_worker
+    # returns False, which means the distribute coordinator won't be called
+    # again in `fit`. This is still correct and intended since the session is
+    # still configured under the distribute coordinator's worker context and
+    # the distribution strategy object is already configured by the distribute
+    # coordinator for multi-worker training.
+    # The logic should be much clearer once the standalone client is merged
+    # into core Keras as well.
+    strategy = get_strategy_object(strategy_cls)
+
+    _run_standalone_client(self, strategy, self._cluster_spec)
+
+
+class KerasMultiWorkerTestIndependentWorker(test_base.IndependentWorkerTestBase,
+                                            parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph'],
+          strategy_cls=[
+              mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
+              collective_strategy.CollectiveAllReduceStrategy,
+          ],
+          required_gpus=[0, 1]))
+  def testSimpleModelIndependentWorkerSync(self, strategy_cls):
+    num_workers = 2
+    num_epoch = 2
+
+    cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
+    self._barrier = dc._Barrier(2)
+
+    # The verification callback will be shared by multiple threads.
+    verification_callback = MultiWorkerVerificationCallback(
+        num_epoch=num_epoch, num_worker=num_workers)
+
+    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
+      """Simulates an Independent Worker inside of a thread."""
+      with test.mock.patch.object(dc, '_run_std_server',
+                                  self._make_mock_run_std_server()):
+        strategy = get_strategy_object(strategy_cls)
+        verification_callback.is_between_graph = \
+            strategy.extended.experimental_between_graph
+        batch_size = 64
+        steps = 10
+        train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
+        with strategy.scope():
+          model = _get_model((28, 28, 1))
+        orig_loss, _ = model.evaluate(train_ds, steps=steps)
+        callbacks_for_fit = nest.flatten(
+            kwargs.get('verification_callback', []))
+        history = model.fit(
+            x=train_ds,
+            epochs=num_epoch,
+            steps_per_epoch=steps,
+            callbacks=callbacks_for_fit)
+        self.assertIsInstance(history, keras.callbacks.History)
+        trained_loss, _ = model.evaluate(train_ds, steps=steps)
+        self.assertLess(trained_loss, orig_loss)
+
+    threads = self.run_multiple_tasks_in_threads(
+        _independent_worker_fn,
+        cluster_spec,
+        verification_callback=verification_callback)
+
+    threads_to_join = []
+    strategy = get_strategy_object(strategy_cls)
+    if strategy.extended.experimental_between_graph:
+      for ts in threads.values():
+        threads_to_join.extend(ts)
+    else:
+      threads_to_join = [threads['worker'][0]]
+    self.join_independent_workers(threads_to_join)
+    verification_callback.verify(self)
+
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph'],
+          strategy_cls=[parameter_server_strategy.ParameterServerStrategy],
+          required_gpus=[0, 1]))
+  def testSimpleModelIndependentWorkerAsync(self, strategy_cls):
+    num_workers = 2
+    num_epoch = 2
+    cluster_spec = test_base.create_cluster_spec(
+        num_workers=num_workers, num_ps=2)
+    self._barrier = dc._Barrier(4)
+
+    # The verification callback will be shared by multiple threads.
+    verification_callback = MultiWorkerVerificationCallback(
+        num_epoch=num_epoch, num_worker=num_workers)
+
+    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
+      """Simulates an Independent Worker inside of a thread."""
+      # TODO(rchao/yuefengz): The following is run by both worker and ps
+      # threads. The distribute coordinator should run std server immediately
+      # without configuring the session (or building the graph) on PS.
+      with test.mock.patch.object(dc, '_run_std_server',
+                                  self._make_mock_run_std_server()):
+        batch_size = 64
+        steps = 10
+        strategy = strategy_cls(num_gpus_per_worker=context.num_gpus())
+        verification_callback.is_between_graph = \
+            strategy.extended.experimental_between_graph
+
+        train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
+        val_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
+        with strategy.scope():
+          model = _get_model((28, 28, 1))
+
+          # TODO(b/123868066): Verify callback for model.evaluate().
+          callbacks_for_fit = nest.flatten(
+              kwargs.get('verification_callback', []))
+          history = model.fit(
+              x=train_ds,
+              epochs=num_epoch,
+              steps_per_epoch=steps,
+              validation_data=val_ds,
+              validation_steps=steps,
+              callbacks=callbacks_for_fit)
+        self.assertIsInstance(history, keras.callbacks.History)
+
+    threads = self.run_multiple_tasks_in_threads(
+        _independent_worker_fn,
+        cluster_spec,
+        verification_callback=verification_callback)
+
+    threads_to_join = []
+    for task_type, ts in threads.items():
+      # This test can finish once the worker threads complete, and thus
+      # the ps threads don't need to be joined.
+      if task_type == 'ps':
+        continue
+      threads_to_join.extend(ts)
+    self.join_independent_workers(threads_to_join)
+    verification_callback.verify(self)
+
+
+if __name__ == '__main__':
+  # Enable manual variable initialization to make sure variables are initialized
+  # by `init_restore_or_wait_for_variables`.
+  backend.manual_variable_initialization(True)
+  with test.mock.patch.object(sys, 'exit', os._exit):
+    test.main()
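
A note for readers unfamiliar with the wrap_methods pattern used by MultiWorkerVerificationCallback above: it rebinds each tracked method to a functools.partial wrapper that bumps a per-task counter under a lock before delegating to the original method. Below is a minimal standalone sketch of the same idea, with the task identity passed in explicitly instead of looked up via test_base.get_task_type()/get_task_index(), and with an invented CallCounter class that is not part of this change:

```python
import collections
import functools
import threading


class CallCounter(object):
  """Counts how often selected methods are invoked, keyed by (task, index)."""

  METHODS_TO_TRACK = ['on_epoch_begin']

  def __init__(self):
    self.counts = collections.defaultdict(
        lambda: collections.defaultdict(int))
    self._lock = threading.Lock()
    for name in self.METHODS_TO_TRACK:
      original = getattr(self, name)

      def wrapped(method_to_wrap, method_name, *args, **kwargs):
        task = kwargs.pop('task', ('worker', 0))
        # The lock keeps += safe when several worker threads share one counter.
        with self._lock:
          self.counts[task][method_name] += 1
        method_to_wrap(*args, **kwargs)

      # functools.partial binds the original method and its name, avoiding the
      # usual late-binding pitfall of closures defined in a loop.
      setattr(self, name, functools.partial(wrapped, original, name))

  def on_epoch_begin(self, epoch, logs=None):
    pass  # A real callback would do work here.


counter = CallCounter()
for epoch in range(2):
  counter.on_epoch_begin(epoch, task=('worker', 0))
  counter.on_epoch_begin(epoch, task=('worker', 1))
print({task: dict(methods) for task, methods in counter.counts.items()})
# {('worker', 0): {'on_epoch_begin': 2}, ('worker', 1): {'on_epoch_begin': 2}}
```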
diff --git a/tensorflow/contrib/distribute/python/keras_utils_test.py b/tensorflow/contrib/distribute/python/keras_utils_test.py
index 371cb18..da17722 100644
--- a/tensorflow/contrib/distribute/python/keras_utils_test.py
+++ b/tensorflow/contrib/distribute/python/keras_utils_test.py
@@ -33,7 +33,7 @@
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.keras.engine import distributed_training_utils
-from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
+from tensorflow.python.keras.optimizer_v2 import rmsprop as rms_prop_keras
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import gradient_descent
 
@@ -497,14 +497,14 @@
       dataset = keras_test_lib.get_dataset(distribution)
       with distribution.scope():
         model = keras_test_lib.get_model()
-        model.compile(gradient_descent_keras.SGD(0.01), 'mse')
+        model.compile(rms_prop_keras.RMSprop(learning_rate=0.01), 'mse')
         model.fit(dataset, epochs=1, steps_per_epoch=1)
 
         weights_file = tempfile.mktemp('.h5')
         model.save_weights(weights_file)
 
         model_2 = keras_test_lib.get_model()
-        model_2.compile(gradient_descent_keras.SGD(0.01), 'mse')
+        model_2.compile(rms_prop_keras.RMSprop(learning_rate=0.01), 'mse')
         model_2.load_weights(weights_file)
         model_2.predict(
             keras_test_lib.get_predict_dataset(distribution), steps=2)
@@ -512,40 +512,23 @@
 
   @combinations.generate(
       keras_test_lib.all_strategy_combinations_minus_default())
-  def test_save_load_trackable_optimizer_v1(self, distribution):
-    with self.cached_session():
-      dataset = keras_test_lib.get_dataset(distribution)
-      with distribution.scope():
-        model = keras_test_lib.get_model()
-        model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
-        model.fit(dataset, epochs=1, steps_per_epoch=1)
-
-        weights_file = tempfile.mktemp()
-        model.save_weights(weights_file)
-
-        model_2 = keras_test_lib.get_model()
-        model_2.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
-        model_2.load_weights(weights_file)
-        model_2.predict(
-            keras_test_lib.get_predict_dataset(distribution), steps=2)
-        model_2.fit(dataset, epochs=1, steps_per_epoch=1)
-
-  @combinations.generate(
-      keras_test_lib.all_strategy_minus_default_and_tpu_combinations())
-  def test_save_load_trackable_optimizer_v2(self, distribution):
+  def test_save_load_trackable(self, distribution):
     # TODO(b/123533246): Enable the test for TPU once bug is fixed
+    if (isinstance(distribution, tpu_strategy.TPUStrategy) and
+        distribution.extended.steps_per_run > 1):
+      self.skipTest('MultiStep TPU Strategy deadlocks with optimizer restore.')
     with self.cached_session():
       dataset = keras_test_lib.get_dataset(distribution)
       with distribution.scope():
         model = keras_test_lib.get_model()
-        model.compile(gradient_descent_keras.SGD(0.01), 'mse')
+        model.compile(rms_prop_keras.RMSprop(learning_rate=0.01), 'mse')
         model.fit(dataset, epochs=1, steps_per_epoch=1)
 
         weights_file = tempfile.mktemp()
         model.save_weights(weights_file)
 
         model_2 = keras_test_lib.get_model()
-        model_2.compile(gradient_descent_keras.SGD(0.01), 'mse')
+        model_2.compile(rms_prop_keras.RMSprop(learning_rate=0.01), 'mse')
         model_2.load_weights(weights_file)
         model_2.predict(
             keras_test_lib.get_predict_dataset(distribution), steps=2)
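
For context on the pattern these keras_utils_test changes exercise, here is a rough standalone sketch of the compile / save_weights / load_weights round trip with the RMSprop optimizer, run outside any distribution strategy. The build_model helper and the random data are stand-ins for keras_test_lib.get_model() and its dataset, not part of this change:

```python
import tempfile

import numpy as np
import tensorflow as tf


def build_model():
  # A throwaway two-layer model; the real test uses keras_test_lib.get_model().
  return tf.keras.Sequential([
      tf.keras.layers.Dense(4, activation='relu', input_shape=(8,)),
      tf.keras.layers.Dense(1)])


model = build_model()
model.compile(tf.keras.optimizers.RMSprop(0.01), 'mse')
model.fit(np.random.rand(16, 8), np.random.rand(16, 1), epochs=1, verbose=0)

# The '.h5' suffix selects the HDF5 weight format (requires h5py).
weights_file = tempfile.mktemp('.h5')
model.save_weights(weights_file)

# A second, freshly compiled model can load the saved weights and keep going.
model_2 = build_model()
model_2.compile(tf.keras.optimizers.RMSprop(0.01), 'mse')
model_2.load_weights(weights_file)
model_2.predict(np.random.rand(4, 8))
```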
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py
index 232620e..66a464d 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py
@@ -135,7 +135,7 @@
     encoder_input = keras.layers.Embedding(
         vocab, embedding_dim, mask_zero=True)(
             inputs)
-    encoder_output = keras.layers.UnifiedLSTM(
+    encoder_output = keras.layers.LSTM(
         self.memory_size, return_sequences=True)(
             encoder_input)
 
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index e05f040..411a080 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -63,7 +63,15 @@
 # //tensorflow/tensorflow.bzl) will include the necessary symbols in binary
 # build targets.
 
+package_group(
+    name = "dependency_whitelist",
+    packages = [
+        "//learning/freud/topic_models/tensorflow/...",
+    ],
+)
+
 package(default_visibility = [
+    ":dependency_whitelist",
     "//tensorflow:internal",
     "//tensorflow_models:__subpackages__",
 ])
diff --git a/tensorflow/core/api_def/base_api/api_def_NonDeterministicInts.pbtxt b/tensorflow/core/api_def/base_api/api_def_NonDeterministicInts.pbtxt
new file mode 100644
index 0000000..0a55267
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NonDeterministicInts.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "NonDeterministicInts"
+  visibility: HIDDEN
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Non-deterministic integer values with specified shape.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Non-deterministically generates some integers."
+  description: <<END
+This op may use some OS-provided source of non-determinism (e.g. an RNG), so each execution will give different results.
+END
+}
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 2019530..1eec6d4 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -86,10 +86,10 @@
 //
 // The passed in *handle will be Unreffed if it is replaced.
 //
-// `op_device` is passed in explicitly because `op->device()` might be unset
-// and we might have selected some specific device to run this op on.
+// `op_device_name` is passed in explicitly because `op->device()` might be
+// unset and we might have selected some specific device to run this op on.
 Status MaybeCopyInputToExpectedDevice(EagerOperation* op,
-                                      const Device* op_device, int i,
+                                      const string& op_device_name, int i,
                                       const Device* expected_input_device,
                                       RunMetadata* run_metadata,
                                       TensorHandle** handle) {
@@ -115,7 +115,7 @@
             " cannot compute ",
             op->Name(), " as input #", i, " was expected to be on ",
             expected_input_device->name(), " but is actually on ",
-            actual_device->name(), " (operation running on ", op_device->name(),
+            actual_device->name(), " (operation running on ", op_device_name,
             ")",
             " Tensors can be copied explicitly using .gpu() or .cpu() "
             "methods,"
@@ -128,7 +128,7 @@
                      << " was expected to be on "
                      << expected_input_device->name() << " but is actually on "
                      << actual_device->name() << " (operation running on "
-                     << op_device->name()
+                     << op_device_name
                      << "). This triggers a copy which can be a performance "
                         "bottleneck.";
         break;
@@ -174,7 +174,11 @@
   return Status::OK();
 }
 
-Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
+// `op_device_name` is the name of the device on which the op will run, if any.
+// For functions running via the function library runtime, the device can be
+// unspecified.
+Status ValidateInputTypeAndPlacement(EagerContext* ctx,
+                                     const string& op_device_name,
                                      EagerOperation* op,
                                      const KernelAndDevice* kernel,
                                      RunMetadata* run_metadata) {
@@ -185,7 +189,7 @@
   for (int i = 0; i < op->Inputs().size(); ++i) {
     const Device* expected_device = kernel->InputDevice(i);
     TF_RETURN_IF_ERROR(MaybeCopyInputToExpectedDevice(
-        op, op_device, i, expected_device, run_metadata,
+        op, op_device_name, i, expected_device, run_metadata,
         &((*op->MutableInputs())[i])));
     tensorflow::TensorHandle* handle = op->Inputs()[i];
     if (handle->dtype != kernel->input_type(i)) {
@@ -345,16 +349,31 @@
   return Status::OK();
 }
 
+// There are a lot of references to devices in and around this function.
+// Here is what they mean:
+//  EagerOperation::Device(): The device on which the user requested the op
+//    be executed, except if we had to change the device due to resource inputs
+//    or CPU pinning. If the user did not request a device, the op does not
+//    take resources, and we did not pin it to CPU, then the device can be
+//    nullptr.
+//  KernelAndDevice::Device(): The first time we see an op (combined with
+//    its attributes), we need to create a KernelAndDevice object for it.
+//    If op->Device() is a nullptr, we select a device for the op when
+//    creating the KernelAndDevice. A concrete device will always be selected
+//    here except when `op` is a function to be executed using the function
+//    library runtime. In this case, we don't select a device because running
+//    a function with an explicitly requested device behaves differently from
+//    running it without one.
 Status EagerLocalExecute(EagerOperation* op,
                          gtl::InlinedVector<TensorHandle*, 2>* retvals,
                          int* num_retvals) {
+  const string unspecified_device_name("<unspecified>");
   EagerContext* ctx = op->EagerContext();
   auto status = ctx->GetStatus();
   if (!status.ok()) return status;
   Device* device = op->Device();
 
   const string& maybe_unspecified_device_name =
-      device == nullptr ? "unspecified" : device->name();
+      device == nullptr ? unspecified_device_name : device->name();
   Fprint128 cache_key =
       op->MutableAttrs()->CacheKey(maybe_unspecified_device_name);
 
@@ -382,26 +401,30 @@
       op->MutableAttrs()->Set(kXlaCompileAttr, true);
       compile_with_xla = true;
     }
+    bool run_function_with_flr = is_multi_device_function && !compile_with_xla;
 
     const NodeDef& ndef = op->MutableAttrs()->BuildNodeDef();
-    if (device == nullptr) {
+    if (!run_function_with_flr && device == nullptr) {
       status = SelectDevice(ndef, ctx, &device);
       if (!status.ok()) return status;
     }
+    const string& device_name =
+        device == nullptr ? unspecified_device_name : device->name();
     if (ctx->LogDevicePlacement()) {
-      printf("Executing op %s in device %s\n", ndef.op().c_str(),
-             device->name().c_str());
-      LOG(INFO) << "Executing op " << ndef.op() << " in device "
-                << device->name();
+      LOG(INFO) << "Executing op " << ndef.op() << " in device " << device_name;
+    } else {
+      VLOG(1) << "Executing op " << ndef.op() << " in device " << device_name;
     }
 
-    auto* flr = ctx->func_lib(device);
-    if (flr == nullptr) {
+    FunctionLibraryRuntime* flr =
+        device == nullptr ? nullptr : ctx->func_lib(device);
+    if (device != nullptr && flr == nullptr) {
       return errors::Unavailable(
           "Unable to find a FunctionLibraryRuntime corresponding to device ",
           device->name());
     }
-    auto runner = (flr->runner() != nullptr) ? flr->runner() : ctx->runner();
+    auto runner = (flr != nullptr && flr->runner() != nullptr) ? flr->runner()
+                                                               : ctx->runner();
     GraphCollector* graph_collector = nullptr;
     if (ctx->ShouldStoreGraphs()) {
       graph_collector = ctx->GetGraphCollector();
@@ -409,7 +432,7 @@
     // Treat the function as multi_device only when we are not compiling
     // it wholly with XLA. When compiling wholly with XLA, flr->CreateKernel
     // will create an XlaLaunchOp kernel to compile and run the function.
-    if (is_multi_device_function && !compile_with_xla) {
+    if (run_function_with_flr) {
       // Multi-device functions don't use the rendezvous from eager context.
       // If we use that rendezvous, multiple concurrent calls to the same
       // function will likely result in collisions. However, this also means
@@ -446,13 +469,11 @@
                                    *num_retvals);
   }
   *num_retvals = output_dtypes_size;
-  if (device == nullptr) {
-    // TODO(apassos) debug how the assignment below might return a different
-    // device from the one requested above.
-    device = kernel->device();
-  }
+  const string& device_name = kernel->device() == nullptr
+                                  ? unspecified_device_name
+                                  : kernel->device()->name();
   status = ValidateInputTypeAndPlacement(
-      ctx, device, op, kernel,
+      ctx, device_name, op, kernel,
       ctx->ShouldStoreStepStats() ? ctx->RunMetadataProto() : nullptr);
   if (!status.ok()) return status;
   std::unique_ptr<NodeExecStats> maybe_stats;
@@ -488,16 +509,16 @@
           /* resource_device= */ kernel->OutputResourceDevice(i),
           output_dtypes[i], ctx);
     }
-    EagerNode* node = new ExecuteNode(
-        id, ctx, op->Device(), op->Inputs(), kernel, maybe_stats.release(),
-        maybe_step_stats, graph_collector, output_dtypes, *retvals);
+    EagerNode* node = new ExecuteNode(id, ctx, op->Inputs(), kernel,
+                                      maybe_stats.release(), maybe_step_stats,
+                                      graph_collector, output_dtypes, *retvals);
     ctx->ExecutorAdd(node);
   } else {
     // Execute checks if retvals[i] is nullptr or not to figure if it needs to
     // allocate it.
-    status = EagerKernelExecute(ctx, op->Device(), op->Inputs(), kernel,
-                                maybe_stats.get(), maybe_step_stats,
-                                graph_collector, retvals->data(), *num_retvals);
+    status = EagerKernelExecute(ctx, op->Inputs(), kernel, maybe_stats.get(),
+                                maybe_step_stats, graph_collector,
+                                retvals->data(), *num_retvals);
   }
 
   return status;
@@ -652,8 +673,8 @@
       // correctly determined after the kernel is selected/instantiated, since
       // the op might have its inputs on host memory.
       TF_RETURN_IF_ERROR(MaybeCopyInputToExpectedDevice(
-          op, op->Device(), i, remote_cpu_device, /* run_metadata= */ nullptr,
-          &(*op->MutableInputs())[i]));
+          op, op->Device()->name(), i, remote_cpu_device,
+          /* run_metadata= */ nullptr, &(*op->MutableInputs())[i]));
     }
 
     tensorflow::TensorHandle* input = op->Inputs()[i];
@@ -778,44 +799,6 @@
          !absl::StartsWith(op_type, "XRT");
 }
 
-Status MaybeUpdateFunctionOpDevice(EagerOperation* op) {
-  gtl::FlatMap<Device*, int> device_counts;
-  Device* op_device =
-      op->Device() == nullptr ? op->EagerContext()->HostCPU() : op->Device();
-  for (int i = 0; i < op->Inputs().size(); ++i) {
-    TensorHandle* tensor_handle = op->Inputs()[i];
-    if (tensor_handle->dtype == DT_RESOURCE) {
-      Device* resource_device = tensor_handle->resource_device();
-      device_counts[resource_device]++;
-      VLOG(2) << "for op " << op->Name() << " input " << i << " "
-              << DataTypeString(tensor_handle->dtype)
-              << " input device = " << resource_device->name()
-              << ", op device = " << op_device->name();
-    }
-  }
-
-  Device* target_device = nullptr;
-  int target_device_count = 0;
-
-  for (const auto& kv : device_counts) {
-    if (kv.second > target_device_count) {
-      target_device_count = kv.second;
-      target_device = kv.first;
-    }
-  }
-
-  if (target_device != nullptr &&
-      (target_device != op_device || op->Device() == nullptr)) {
-    VLOG(1) << (target_device != op_device ? "Changing " : "Setting ")
-            << "device of operation " << op->Name() << " to "
-            << target_device->name() << " because most inputs are resources on"
-            << " this device.";
-    op->SetDevice(target_device);
-  }
-
-  return Status::OK();
-}
-
 // The Op device may be updated if:
 // - A resource touching input is specified: all resource-touching ops run in
 // the device the resource is, regardless of anything else that has been
@@ -826,7 +809,13 @@
 // "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING" to "0" or "false".
 Status MaybeUpdateOpDevice(EagerOperation* op) {
   if (op->is_function()) {
-    return MaybeUpdateFunctionOpDevice(op);
+    // Don't update the device of direct function calls.
+    // In particular, if the user did not explicitly request any device for this
+    // function, picking a device would result in this device being the default
+    // for nodes inside the function. This is undesirable for multi-device
+    // functions since the not-explicitly-placed nodes inside the body will all
+    // end up on this default device.
+    return Status::OK();
   }
   EagerContext* ctx = op->EagerContext();
   bool all_inputs_eligible_for_cpu_pinning =
@@ -920,18 +909,12 @@
   return EagerRemoteExecute(op, retvals->data(), num_retvals);
 }
 
-Status EagerKernelExecute(EagerContext* ctx, Device* device,
+Status EagerKernelExecute(EagerContext* ctx,
                           const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
                           KernelAndDevice* kernel, NodeExecStats* maybe_stats,
                           StepStats* maybe_step_stats,
                           GraphCollector* graph_collector,
                           TensorHandle** retvals, int num_retvals) {
-  if (device == nullptr) {
-    // TODO(apassos) debug how the assignment below might return a different
-    // device from the one requested above.
-    device = kernel->device();
-  }
-
   std::vector<Tensor> outputs(1);
 
   // If there are multiple references to a TensorHandle in 'op_inputs' we must
@@ -1008,16 +991,22 @@
           step_stats->add_dev_stats();
         }
         // Find the current device's index.
+        // If device is a nullptr (we are running a function without an
+        // explicitly requested device), attribute the function's runtime to
+        // the CPU.
+        Device* attribution_device = kernel->device();
+        if (attribution_device == nullptr) {
+          attribution_device = ctx->HostCPU();
+        }
         int device_idx = 0;
         for (int i = 0; i < ctx->devices()->size(); ++i) {
-          if (ctx->devices()->at(i) == device) {
+          if (ctx->devices()->at(i) == attribution_device) {
             device_idx = i;
             break;
           }
         }
         // Populate the device stats for this device.
         auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
-        dev_stats->set_device(device->name());
+        dev_stats->set_device(attribution_device->name());
         *dev_stats->add_node_stats() = *maybe_stats;
       }
     }
@@ -1027,12 +1016,12 @@
     if (retvals[i] == nullptr) {
       retvals[i] =
           new TensorHandle(outputs[i], /* d= */ kernel->OutputDevice(i),
-                           /* op_device= */ device, ctx);
+                           /* op_device= */ kernel->device(), ctx);
     } else {
       // In the async case, the retval is not a nullptr, and its device is
       // already set since all TensorHandles always have their device set
-      // during construction.
-      DCHECK_EQ(device, retvals[i]->op_device());
+      // (potentially to nullptr) during construction.
+      DCHECK_EQ(kernel->device(), retvals[i]->op_device());
       DCHECK_EQ(kernel->OutputDevice(i), retvals[i]->device());
 
       retvals[i]->SetTensor(outputs[i]);
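
A rough Python analogue of the device-name handling introduced in execute.cc above, where a missing device is represented by the fixed "<unspecified>" placeholder so it can still appear in log messages and the kernel cache key. The op names, attrs, and cache structure below are invented purely for illustration and do not correspond to TensorFlow APIs:

```python
UNSPECIFIED_DEVICE = '<unspecified>'


def device_name(device):
  # `device` may be None when a function is left to the function library
  # runtime to place; use a stable placeholder so it can still key a cache
  # and be printed in logs and error messages.
  return UNSPECIFIED_DEVICE if device is None else device


def cache_key(op_name, attrs, device=None):
  # Mirrors the idea of keying the kernel cache by attrs plus the
  # (possibly unspecified) device name: the same op with the same attrs
  # maps to different kernels on different devices.
  return (op_name, tuple(sorted(attrs.items())), device_name(device))


kernel_cache = {}
key = cache_key('MatMul', {'T': 'float32'}, device='/device:GPU:0')
kernel_cache.setdefault(key, object())  # stand-in for a KernelAndDevice

# A function executed via the function library runtime may have no device yet:
fn_key = cache_key('MyFunction', {}, device=None)
assert fn_key[-1] == UNSPECIFIED_DEVICE
```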
diff --git a/tensorflow/core/common_runtime/eager/execute.h b/tensorflow/core/common_runtime/eager/execute.h
index 4945688..b05139a 100644
--- a/tensorflow/core/common_runtime/eager/execute.h
+++ b/tensorflow/core/common_runtime/eager/execute.h
@@ -41,9 +41,9 @@
     tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2>* retvals,
     int* num_retvals);
 
-// Low-level utility to execute the kernel specified by kernel on device
-// 'device', with the inputs op_inputs, in the context 'ctx'.
-Status EagerKernelExecute(EagerContext* ctx, Device* device,
+// Low-level utility to execute the kernel specified by `kernel` on
+// `kernel->device()`, with the inputs op_inputs, in the context 'ctx'.
+Status EagerKernelExecute(EagerContext* ctx,
                           const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
                           KernelAndDevice* kernel, NodeExecStats* maybe_stats,
                           StepStats* maybe_step_stats,
diff --git a/tensorflow/core/common_runtime/eager/execute_node.h b/tensorflow/core/common_runtime/eager/execute_node.h
index 4459e32..723b22d 100644
--- a/tensorflow/core/common_runtime/eager/execute_node.h
+++ b/tensorflow/core/common_runtime/eager/execute_node.h
@@ -31,7 +31,7 @@
 
 class ExecuteNode : public EagerNode {
  public:
-  ExecuteNode(uint64 id, EagerContext* ctx, Device* op_device,
+  ExecuteNode(uint64 id, EagerContext* ctx,
               const tensorflow::gtl::InlinedVector<TensorHandle*, 4>& inputs,
               KernelAndDevice* kernel, NodeExecStats* maybe_stats,
               StepStats* maybe_step_stats, GraphCollector* graph_collector,
@@ -39,7 +39,6 @@
               const tensorflow::gtl::InlinedVector<TensorHandle*, 2>& retvals)
       : EagerNode(id),
         ctx_(ctx),
-        op_device_(op_device),
         inputs_(inputs),
         kernel_(kernel),
         maybe_stats_(maybe_stats),
@@ -65,8 +64,8 @@
 
   tensorflow::Status Run() override {
     const Status status = EagerKernelExecute(
-        ctx_, op_device_, inputs_, kernel_, maybe_stats_.get(),
-        maybe_step_stats_, graph_collector_, retvals_.begin(), retvals_.size());
+        ctx_, inputs_, kernel_, maybe_stats_.get(), maybe_step_stats_,
+        graph_collector_, retvals_.begin(), retvals_.size());
     if (status.ok()) {
       return status;
     } else {
@@ -79,7 +78,6 @@
 
  private:
   tensorflow::EagerContext* ctx_;
-  tensorflow::Device* op_device_;
   tensorflow::gtl::InlinedVector<TensorHandle*, 4> inputs_;
   tensorflow::KernelAndDevice* kernel_;
   std::unique_ptr<NodeExecStats> maybe_stats_;
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
index 8a06aa8..5a61c76 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
@@ -67,6 +67,11 @@
 Status KernelAndDeviceOp::Init(const NodeDef& ndef,
                                GraphCollector* graph_collector) {
   OpKernel* k = nullptr;
+  if (flr_ == nullptr) {
+    return errors::Internal(
+        "A valid FunctionLibraryRuntime must be provided when running ops "
+        "based on OpKernel.");
+  }
   TF_RETURN_IF_ERROR(flr_->CreateKernel(ndef, &k));
   kernel_.reset(k);
   return Status::OK();
@@ -75,8 +80,18 @@
 Status KernelAndDeviceFunc::Init(const NodeDef& ndef,
                                  GraphCollector* graph_collector) {
   const OpDef* op_def = nullptr;
-  const FunctionDef* function_def =
-      flr_->GetFunctionLibraryDefinition()->Find(ndef.op());
+  const FunctionDef* function_def;
+  if (flr_ == nullptr) {
+    // If the function is being executed without an explicit device request,
+    // look up the FunctionDef in the CPU's FLR. All FLRs share the same
+    // library.
+    function_def = pflr_->GetFLR(host_cpu_device_->name())
+                       ->GetFunctionLibraryDefinition()
+                       ->Find(ndef.op());
+  } else {
+    function_def = flr_->GetFunctionLibraryDefinition()->Find(ndef.op());
+  }
+
   if (function_def != nullptr) {
     op_def = &(function_def->signature());
   } else {
@@ -86,7 +101,7 @@
       InOutTypesForNode(ndef, *op_def, &input_dtypes_, &output_dtypes_));
 
   FunctionLibraryRuntime::InstantiateOptions options;
-  options.target = device_->name();
+  options.target = device_ == nullptr ? "" : device_->name();
   options.is_multi_device_function = true;
   for (const Device* device : input_devices_) {
     options.input_devices.push_back(device->name());
@@ -133,10 +148,10 @@
   return Status::OK();
 }
 
-Status KernelAndDevice::Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
-                            std::vector<Tensor>* outputs, NodeExecStats* stats,
-                            StepStats* step_stats,
-                            GraphCollector* graph_collector) {
+Status KernelAndDeviceOp::Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
+                              std::vector<Tensor>* outputs,
+                              NodeExecStats* stats, StepStats* step_stats,
+                              GraphCollector* graph_collector) {
   ScopedStepContainer step_container(0, [this](const string& name) {
     device_->resource_manager()->Cleanup(name).IgnoreError();
   });
@@ -144,6 +159,20 @@
                    graph_collector);
 }
 
+Status KernelAndDeviceFunc::Run(
+    const gtl::InlinedVector<TensorValue, 4>& inputs,
+    std::vector<Tensor>* outputs, NodeExecStats* stats, StepStats* step_stats,
+    GraphCollector* graph_collector) {
+  const std::vector<Device*> devices = pflr_->device_mgr()->ListDevices();
+  ScopedStepContainer step_container(0, [&devices](const string& name) {
+    for (Device* device : devices) {
+      device->resource_manager()->Cleanup(name).IgnoreError();
+    }
+  });
+  return this->Run(&step_container, inputs, outputs, stats, step_stats,
+                   graph_collector);
+}
+
 namespace {
 void UpdateStats(OpKernelContext* context,
                  StepStatsCollector* step_stats_collector,
@@ -179,6 +208,11 @@
                               std::vector<Tensor>* outputs,
                               NodeExecStats* stats, StepStats* step_stats,
                               GraphCollector* graph_collector) {
+  gtl::InlinedVector<AllocatorAttributes, 4> in_attrs(kernel_->num_inputs());
+  for (size_t i = 0; i < in_attrs.size(); ++i) {
+    in_attrs[i].set_on_host(kernel_->input_memory_types()[i] ==
+                            tensorflow::HOST_MEMORY);
+  }
   std::vector<AllocatorAttributes> out_attrs(kernel_->num_outputs());
   for (size_t i = 0; i < out_attrs.size(); ++i) {
     out_attrs[i].set_on_host(kernel_->output_memory_types()[i] ==
@@ -200,6 +234,7 @@
   params.inputs = &inputs;
   params.op_kernel = kernel_.get();
   params.resource_manager = device_->resource_manager();
+  params.input_alloc_attrs = &in_attrs;
   params.output_attr_array = gtl::vector_as_array(&out_attrs);
   params.function_library = flr_;
   params.slice_reader_cache = &slice_reader_cache_;
@@ -278,8 +313,10 @@
   // function library runtime to create a new for this call. We could have
   // created one here but it requires more state to be kept in
   // KernelAndDeviceFunc.
-  opts.rendezvous = nullptr;
-  opts.create_rendezvous = true;
+  Rendezvous* rendezvous = new IntraProcessRendezvous(pflr_->device_mgr());
+  opts.rendezvous = rendezvous;
+  opts.create_rendezvous = false;
+
   opts.cancellation_manager = &cm_;
   cm_.Reset();
   // eager runtime does not yet support collective ops.
@@ -305,13 +342,14 @@
     input_vector.push_back(*tensor_value.tensor);
   }
 
-  flr_->Run(opts, handle_, input_vector, outputs,
-            [&status, &done](const Status& s) {
-              status = s;
-              done.Notify();
-            });
+  pflr_->Run(opts, handle_, input_vector, outputs,
+             [&status, &done](const Status& s) {
+               status = s;
+               done.Notify();
+             });
   done.WaitForNotification();
 
+  rendezvous->Unref();
   if (step_stats_collector != nullptr) {
     step_stats_collector->Finalize();
   }
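
The KernelAndDeviceFunc::Run change above builds a step container whose cleanup sweeps the resource managers of every device known to the process function library runtime, rather than a single device as in KernelAndDeviceOp::Run. A toy Python analogue of that clean-up-everywhere-on-exit pattern follows; ResourceManager and the device names are invented stand-ins, not TensorFlow APIs:

```python
import contextlib


class ResourceManager(object):
  """Toy per-device resource manager that tracks per-step containers."""

  def __init__(self, device):
    self.device = device
    self.containers = set()

  def cleanup(self, name):
    self.containers.discard(name)
    print('cleaned %s on %s' % (name, self.device))


@contextlib.contextmanager
def step_container(name, resource_managers):
  # On exit, clean the per-step container on every device, mirroring the
  # per-device Cleanup(name) loop added to KernelAndDeviceFunc::Run.
  try:
    yield name
  finally:
    for rm in resource_managers:
      rm.cleanup(name)


managers = [ResourceManager(d) for d in ('/device:CPU:0', '/device:GPU:0')]
with step_container('step_0', managers):
  for rm in managers:
    rm.containers.add('step_0')  # pretend kernels created per-step state
```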
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h
index 6f95f2e..e9573b0 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.h
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h
@@ -60,12 +60,14 @@
 
   // Non-multi-device functions are run using regular CallOp and look like
   // primitive operations from KernelAndDevice perspective.
+  // `flr` can be nullptr if the operation is not run on any specific device
+  // (currently this can happen only for multi-device functions).
   KernelAndDevice(
       FunctionLibraryRuntime* flr,
       std::function<void(std::function<void()>)>* runner,
       std::unique_ptr<CollectiveExecutor::Handle> collective_executor,
       Device* host_cpu_device)
-      : device_(flr->device()),
+      : device_(flr == nullptr ? nullptr : flr->device()),
         host_cpu_device_(host_cpu_device),
         flr_(flr),
         runner_(runner),
@@ -76,9 +78,10 @@
   virtual ~KernelAndDevice() {}
 
   // TODO(ashankar): Handle list-valued inputs.
-  Status Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
-             std::vector<Tensor>* outputs, NodeExecStats* stats,
-             StepStats* step_stats, GraphCollector* graph_collector);
+  virtual Status Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
+                     std::vector<Tensor>* outputs, NodeExecStats* stats,
+                     StepStats* step_stats,
+                     GraphCollector* graph_collector) = 0;
 
   virtual Status Run(ScopedStepContainer* step_container,
                      const gtl::InlinedVector<TensorValue, 4>& inputs,
@@ -92,7 +95,8 @@
   // Else, returns nullptr.
   virtual Device* OutputResourceDevice(int idx) const = 0;
 
-  // Returns nullptr for functions.
+  // Returns the kernel that will be used to run this.
+  // Returns nullptr if this will be run using the function library runtime.
   virtual const OpKernel* kernel() const = 0;
 
   // Returns the device on which this kernel will run. In the case of
@@ -114,9 +118,9 @@
   // provided here only for the few kernels which can't handle one being
   // missing.
   CancellationManager cm_;
-  Device* const device_;           // non-null
-  Device* const host_cpu_device_;  // non-null
-  FunctionLibraryRuntime* const flr_;
+  Device* const device_;               // can be null
+  Device* const host_cpu_device_;      // non-null
+  FunctionLibraryRuntime* const flr_;  // can be null
   std::function<void(std::function<void()>)>* const runner_;
   std::function<void(std::function<void()>)> default_runner_;
   const std::unique_ptr<CollectiveExecutor::Handle> collective_executor_;
@@ -140,7 +144,9 @@
 
   Status Init(const NodeDef& ndef, GraphCollector* graph_collector) override;
 
-  using KernelAndDevice::Run;
+  Status Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
+             std::vector<Tensor>* outputs, NodeExecStats* stats,
+             StepStats* step_stats, GraphCollector* graph_collector) override;
 
   Status Run(ScopedStepContainer* step_container,
              const gtl::InlinedVector<TensorValue, 4>& inputs,
@@ -181,6 +187,9 @@
 // In such cases, KernelAndDeviceOp is used.
 class KernelAndDeviceFunc final : public KernelAndDevice {
  public:
+  // `flr` can be nullptr.
+  // `pflr` must not be nullptr.
+  // `host_cpu_device` must not be nullptr.
   KernelAndDeviceFunc(
       FunctionLibraryRuntime* flr, ProcessFunctionLibraryRuntime* pflr,
       std::vector<Device*> input_devices,
@@ -197,8 +206,9 @@
 
   Status Init(const NodeDef& ndef, GraphCollector* graph_collector) override;
 
-  using KernelAndDevice::Run;
-
+  Status Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
+             std::vector<Tensor>* outputs, NodeExecStats* stats,
+             StepStats* step_stats, GraphCollector* graph_collector) override;
   Status Run(ScopedStepContainer* step_container,
              const gtl::InlinedVector<TensorValue, 4>& inputs,
              std::vector<Tensor>* outputs, NodeExecStats* stats,
@@ -218,7 +228,7 @@
   int num_outputs() const override { return output_dtypes_.size(); }
 
  private:
-  ProcessFunctionLibraryRuntime* const pflr_;
+  ProcessFunctionLibraryRuntime* const pflr_;  // non-null
   FunctionLibraryRuntime::Handle handle_;
   // CPU devices are null. Resource handles' devices are actual backing
   // devices.
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h
index ac99fdb..f530f0a 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.h
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.h
@@ -158,6 +158,9 @@
 
   // Device in which the op producing this tensor was executed. Equals to
   // device_ for constant tensors.
+  // Can be nullptr if the op producing this tensor was a function executed
+  // with the function library runtime or if this tensor represents a symbolic
+  // tensor.
   tensorflow::Device* const op_device_;
 
   // If the tensor dtype is DT_RESOURCE, resource_device_ holds the device
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 600a935..488d0c7 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -1442,8 +1442,19 @@
   return Status::OK();
 }
 
+using OutputControlSrc = InlineFunctionBodyOptions::OutputControlSource;
+
 }  // namespace
 
+string InlineFunctionBodyOptions::DebugString() const {
+  return absl::StrCat("ignore_noinline=", ignore_noinline ? "true" : "false",
+                      ", override_device=", override_device ? "true" : "false",
+                      ", output_control_src=",
+                      output_control_src == OutputControlSrc::kDataOutputs
+                          ? "DataOutputs"
+                          : "ControlOutputs");
+}
+
 Status ValidateInlining(const Node* node, const FunctionBody* fbody,
                         const InlineFunctionBodyOptions& options) {
   // TODO(ezhulenev): Currently common_runtime function inlining can't guarantee
@@ -1544,8 +1555,8 @@
 // 2) Create "output_control_node" NoOp. All nodes that have incoming control
 //    edge *from* the function call node, will be forwarded to this node.
 //
-//    We have two options for choosing which nodes will a control edge *to* the
-//    "output control node":
+//    We have two options for choosing which nodes will have a control edge *to*
+//    the "output control node":
 //       a) control returns            (`control_ret` field in FunctionDef)
 //       b) data returns               (`ret` field in FunctionDef)
 //
@@ -1574,7 +1585,8 @@
 Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
                           Node* caller, const FunctionBody* fbody,
                           const InlineFunctionBodyOptions& options) {
-  VLOG(3) << "Inline function call: " << SummarizeNode(*caller);
+  VLOG(3) << "Inline function call: " << SummarizeNode(*caller) << " ["
+          << options.DebugString() << "]";
   VLOG(4) << "Inlined function definition: " << DebugString(fbody->fdef);
 
   Status validation = ValidateInlining(caller, fbody, options);
@@ -1585,8 +1597,8 @@
   }
 
   // ------------------------------------------------------------------------ //
-  // We insert NoOps before/after inlined function body nodes, to enforce
-  // side-effects execution order.
+  // Helper functions to create `NoOp` and `Identity` nodes for auxiliary
+  // control nodes and inlined function inputs and outputs.
 
   // Add a NoOp node for function control inputs/outputs.
   const auto no_op = [&](StringPiece name) {
@@ -1710,16 +1722,17 @@
   // ------------------------------------------------------------------------ //
   // Connect output edges.
   //
-  // For i-th return node in fbody->graph, we add in "g" an identity
-  // node (outputs[i-th]). We then reconnect every incoming edge into
-  // the i-th return node to the added identity node.
+  // For i-th return node in fbody->graph, we add in "g" an identity node
+  // (outputs[i-th]). We then reconnect every incoming edge into the i-th return
+  // node to the added identity node.
   //
-  // For every data edge coming out of "callee"s i-th output, we
-  // reconnect it to the i-th identity added above.
+  // For every data edge coming out of "callee"s i-th output, we reconnect it to
+  // the i-th identity added above.
   //
-  // If "callee" is control-depended upon by any other nodes, we add a
-  // NoOp node "output_control_node". "output_control_node" depends on
-  // all identity nodes added above. And nodes previously depend on
+  // If "callee" is control-depended upon by any other nodes, we add a NoOp node
+  // "output_control_node". "output_control_node" depends on all identity nodes
+  // added above or on all control return nodes (controlled by
+  // `options.output_control_src` value). And nodes previously depend on
   // "callee" is changed to depend on "output_control_node".
   std::vector<Node*> outputs(caller->num_outputs());
   for (std::size_t i = 0; i < fbody->ret_nodes.size(); ++i) {
@@ -1746,8 +1759,16 @@
     if (e->IsControlEdge()) {
       if (output_control_node == nullptr) {
         output_control_node = no_op("output_control_node");
-        for (Node* n : outputs) {
-          g->AddControlEdge(n, output_control_node);
+        if (options.output_control_src ==
+            InlineFunctionBodyOptions::OutputControlSource::kDataOutputs) {
+          for (Node* n : outputs) {
+            g->AddControlEdge(n, output_control_node);
+          }
+        } else {
+          for (Node* fbody_node : fbody->control_ret_nodes) {
+            Node* n = node_map[fbody_node->id()];
+            g->AddControlEdge(n, output_control_node);
+          }
         }
       }
       g->AddControlEdge(output_control_node, e->dst());
@@ -1768,7 +1789,7 @@
 }
 
 bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph,
-                           const InlineFunctionBodyOptions& options) {
+                           const ExpandInlineFunctionsOptions& options) {
   std::vector<std::pair<Node*, const FunctionBody*>> candidates;
 
   const FunctionLibraryDefinition* fld = lib->GetFunctionLibraryDefinition();
@@ -1797,8 +1818,10 @@
 
   bool inlined_any = false;
   for (const auto& p : candidates) {
-    Status inlined =
-        InlineFunctionBody(*fld, graph, p.first, p.second, options);
+    Status inlined = InlineFunctionBody(*fld, graph, p.first, p.second,
+                                        p.first->IsPartitionedCall()
+                                            ? options.multi_device_options
+                                            : options.native_options);
     if (inlined.ok()) {
       inlined_any = true;
     } else {
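
The new branch in InlineFunctionBody chooses where the synthetic output_control_node gets its incoming control edges: from the Identity nodes standing in for data outputs, or from the function's control-return nodes. A compact Python sketch of just that choice over a toy edge list follows; the node names are borrowed from the FunctionWithControlOutputs test added later in this diff, everything else is invented for illustration:

```python
def connect_output_control_node(output_control_src, data_output_nodes,
                                control_ret_nodes):
  """Returns control edges (src, 'output_control_node') for a toy graph."""
  if output_control_src == 'kDataOutputs':
    sources = data_output_nodes          # Tensorflow v1-compatible behaviour
  else:                                  # 'kControlOutputs'
    sources = control_ret_nodes          # honor `control_ret` side effects
  return [(src, 'output_control_node') for src in sources]


# Mirrors the FunctionWithControlOutputs test: `ret` is the data output,
# while `add` only appears in control_ret.
print(connect_output_control_node('kDataOutputs', ['b/ret'], ['b/add']))
print(connect_output_control_node('kControlOutputs', ['b/ret'], ['b/add']))
```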
diff --git a/tensorflow/core/common_runtime/function.h b/tensorflow/core/common_runtime/function.h
index b6db1cb..86b4d21 100644
--- a/tensorflow/core/common_runtime/function.h
+++ b/tensorflow/core/common_runtime/function.h
@@ -160,11 +160,26 @@
 FunctionBody* SymbolicGradient(const FunctionBody& f);
 
 struct InlineFunctionBodyOptions {
+  // All nodes that have an incoming control edge *from* the function call node
+  // will be forwarded to the "output control node". There are two options for
+  // choosing which nodes will have a control edge *to* the "output control
+  // node":
+  //   a) control returns            (`control_ret` field in FunctionDef)
+  //   b) data returns               (`ret` field in FunctionDef)
+  enum class OutputControlSource { kDataOutputs, kControlOutputs };
+
   // Ignore '_noinline' function attribute.
   bool ignore_noinline = false;
   // If 'true' function inlining will override explicitly specified devices
   // inside function body with the caller node device.
   bool override_device = false;
+  // For compatibility with Tensorflow v1, by default we use data outputs.
+  // Control returns were added in Tensorflow v2 together with automatic
+  // control dependency tracking in Eager mode.
+  OutputControlSource output_control_src = OutputControlSource::kDataOutputs;
+
+  // A human-readable debug string for these options.
+  string DebugString() const;
 };
 
 // Returns 'Status::OK()' iff the function '*fbody' can be inlined at 'node'
@@ -192,6 +207,48 @@
                           Node* caller, const FunctionBody* fbody,
                           const InlineFunctionBodyOptions& options);
 
+// There are three types of function calls that could be invoked during
+// *Tensorflow graph execution*:
+//
+// 1) Native function call (node.type_string() is the function name). These
+//    functions are always executed on a single device, which is the device of
+//    the function call node.
+//
+// 2) Multi-device function calls (PartitionedCall or StatefulPartitionedCall
+//    ops) can execute on multiple devices and accept DT_RESOURCE inputs that
+//    belong to different devices. This type of functions was added in
+//    Tensorflow 2.0 Eager mode, and it has control outputs to represent
+//    side-effects that must always execute (see `control_ret` in FunctionDef).
+//
+// 3) SymbolicGradient has been deprecated for a while, but we still keep it
+//    and inline it with the `native` options for compatibility.
+//
+// We need to have distinct inlining rules for compatibility with Tensorflow v1.
+//
+// There are a few other places in Tensorflow that could execute functions:
+//
+// 1) common_runtime/eager/kernel_and_device.{h,cc} - executes "top level"
+//    functions directly via function library runtime, without going through
+//    the graph.
+// 2) tf.data pipelines - also execute functions directly via function library
+//    runtime with custom executors.
+struct ExpandInlineFunctionsOptions {
+  ExpandInlineFunctionsOptions() : native_options(), multi_device_options() {
+    using OutputControlSrc = InlineFunctionBodyOptions::OutputControlSource;
+    multi_device_options.output_control_src = OutputControlSrc::kControlOutputs;
+  }
+
+  InlineFunctionBodyOptions native_options;
+  InlineFunctionBodyOptions multi_device_options;
+};
+
+// WARNING(ezhulenev): PLEASE DO NOT USE THIS FUNCTION. This is a temporary
+// workaround that will be enabled only during the function inlining unification
+// (b/126811947). Contact ezhulenev@ if you think you need it.
+// TODO(ezhulenev): Delete this function.
+bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph,
+                           const ExpandInlineFunctionsOptions& options);
+
 // For each node in "graph", if "lib" indicates that the node is a
 // function call, inline the function body. Returns true if at least
 // one node is inlined.
@@ -203,13 +260,11 @@
 // Function calls that can't be safely inlined into the graph (ValidateInlining
 // returns error), are ignored.
 //
-// If `override_device` is true then the inlined operations are placed on the
-// device the call node is placed on.
-bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph,
-                           const InlineFunctionBodyOptions& options);
-
+// TODO(ezhulenev): We do not need a FunctionLibraryRuntime for this. We need
+// just the FunctionLibraryDefinition and FunctionDefToBodyHelper to implement
+// this (see lower_function_call.cc).
 inline bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph) {
-  return ExpandInlineFunctions(lib, graph, InlineFunctionBodyOptions());
+  return ExpandInlineFunctions(lib, graph, ExpandInlineFunctionsOptions());
 }
 
 // Extracts function name and attributes from `call_def` and invokes
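
A sketch, in Python, of what ExpandInlineFunctionsOptions amounts to: native calls keep the Tensorflow v1 default of data outputs, while PartitionedCall / StatefulPartitionedCall calls use control outputs. The string-valued enum and the options_for_call helper are simplifications invented for this illustration, not TensorFlow APIs:

```python
class InlineFunctionBodyOptions(object):
  def __init__(self, output_control_src='kDataOutputs'):
    self.output_control_src = output_control_src


class ExpandInlineFunctionsOptions(object):
  def __init__(self):
    # Native (and SymbolicGradient) calls keep the Tensorflow v1 default;
    # multi-device calls use the control returns added for 2.0 Eager mode.
    self.native_options = InlineFunctionBodyOptions('kDataOutputs')
    self.multi_device_options = InlineFunctionBodyOptions('kControlOutputs')


def options_for_call(node_op, opts):
  # Stand-in for the p.first->IsPartitionedCall() dispatch in function.cc.
  is_partitioned = node_op in ('PartitionedCall', 'StatefulPartitionedCall')
  return opts.multi_device_options if is_partitioned else opts.native_options


opts = ExpandInlineFunctionsOptions()
assert (options_for_call('PartitionedCall', opts).output_control_src
        == 'kControlOutputs')
assert (options_for_call('XTimesTwo', opts).output_control_src
        == 'kDataOutputs')
```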
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index 72b2b14..15910af 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -801,7 +801,7 @@
 
 // Verifies that control dependencies on the caller are added as control
 // dependencies on any function calls created by inlining.
-TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithControlDeps) {
+TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithInputControlEdges) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour()});
 
   std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
@@ -885,6 +885,99 @@
   }
 }
 
+TEST_F(FunctionLibraryRuntimeTest,
+       ExpandInlineFunctionsWithOutputControlEdges) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+  using OutputControlSrc = InlineFunctionBodyOptions::OutputControlSource;
+
+  // The `add` node is not required to compute the regular output `o`, but it
+  // must execute because it is in `control_ret`.
+  const FunctionDef func =
+      FDH::Create("FunctionWithControlOutputs", {"i: float"}, {"o: float"}, {},
+                  {
+                      {{"add"}, "Add", {"i", "i"}, {{"T", DT_FLOAT}}},
+                      {{"ret"}, "Mul", {"i", "i"}, {{"T", DT_FLOAT}}},
+                  },
+                  /*ret_def=*/{{"o", "ret:z:0"}},
+                  /*control_ret_def=*/{{"must_execute", "add"}});
+
+  Init({func});
+
+  // Construct a graph for the function call:
+  //
+  //   a = Arg[dtype=DT_FLOAT]
+  //   b = FunctionWithControlOutputs(a)
+  //   c = NoOp(^b)
+  //   ret = RetVal(b, ^c)
+  const auto init_graph = [this](std::unique_ptr<Graph>* g) -> void {
+    g->reset(new Graph(OpRegistry::Global()));
+
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
+    auto b = test::function::Call(&s, "b", "FunctionWithControlOutputs", {a});
+    auto c = ops::NoOp(s.WithOpName("c"));
+    auto ret = ops::_Retval(s.WithOpName("ret"), b, 0);
+    s.graph()->AddControlEdge(b.node(), c.operation.node());
+    s.graph()->AddControlEdge(c.operation.node(), ret.operation.node());
+    TF_ASSERT_OK(s.ToGraph(g->get()));
+  };
+
+  std::unique_ptr<Graph> g;
+  ExpandInlineFunctionsOptions opts;
+
+  const string input_node = "Func/b/input/_0";
+  const string output_node = "Func/b/output/_1";
+  const string output_control_node = "Func/b/output_control_node/_2";
+
+  // Use data outputs as output control source.
+  opts.native_options.output_control_src = OutputControlSrc::kDataOutputs;
+
+  init_graph(&g);
+  ExpandInlineFunctions(flr0_, g.get(), opts);
+  {
+    GraphDef expected = test::function::GDef(
+        {NDef("a", "_Arg", {}, {{"T", DT_FLOAT}, {"index", 0}}),
+         NDef(input_node, "Identity", {"a"}, {{"T", DT_FLOAT}}),
+         NDef("b/add", "Add", {input_node, input_node}, {{"T", DT_FLOAT}}),
+         NDef("b/ret", "Mul", {input_node, input_node}, {{"T", DT_FLOAT}}),
+         NDef(output_node, "Identity", {"b/ret"}, {{"T", DT_FLOAT}}),
+         NDef(output_control_node, "NoOp", {"^Func/b/output/_1"}, {}),
+         NDef("c", "NoOp", {"^" + output_control_node}, {}),
+         NDef("ret", "_Retval", {output_node, "^c"},
+              {{"T", DT_FLOAT}, {"index", 0}})},
+        {func});
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  // Use control outputs as output control source.
+  opts.native_options.output_control_src = OutputControlSrc::kControlOutputs;
+
+  init_graph(&g);
+  ExpandInlineFunctions(flr0_, g.get(), opts);
+  {
+    GraphDef expected = test::function::GDef(
+        {NDef("a", "_Arg", {}, {{"T", DT_FLOAT}, {"index", 0}}),
+         NDef(input_node, "Identity", {"a"}, {{"T", DT_FLOAT}}),
+         NDef("b/add", "Add", {input_node, input_node}, {{"T", DT_FLOAT}}),
+         NDef("b/ret", "Mul", {input_node, input_node}, {{"T", DT_FLOAT}}),
+         NDef(output_node, "Identity", {"b/ret"}, {{"T", DT_FLOAT}}),
+         NDef(output_control_node, "NoOp", {"^b/add"}, {}),
+         NDef("c", "NoOp", {"^" + output_control_node}, {}),
+         NDef("ret", "_Retval", {output_node, "^c"},
+              {{"T", DT_FLOAT}, {"index", 0}})},
+        {func});
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+}
+
 TEST_F(FunctionLibraryRuntimeTest, PruneBody) {
   auto T = DT_INT32;
   FunctionDef stateful_func = FDH::Define(
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index b185ea1..31455e5 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -571,8 +571,11 @@
   TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
       OptimizationPassRegistry::PRE_PLACEMENT, optimization_options));
 
-  Placer placer(new_graph.get(), device_set_, session_options_,
-                /* default_device= */ nullptr);
+  Placer placer(new_graph.get(), device_set_, /* default_device= */ nullptr,
+                session_options_ == nullptr ||
+                    session_options_->config.allow_soft_placement(),
+                session_options_ != nullptr &&
+                    session_options_->config.log_device_placement());
   // TODO(mrry): Consider making the Placer cancelable.
   TF_RETURN_IF_ERROR(placer.Run());
 
diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc
index f5352ec..465cddf 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.cc
+++ b/tensorflow/core/common_runtime/graph_optimizer.cc
@@ -87,10 +87,10 @@
       changed = true;
     }
     if (opts_.do_function_inlining()) {
-      InlineFunctionBodyOptions inline_opts;
-      inline_opts.override_device = true;
+      ExpandInlineFunctionsOptions expand_inline_opts;
+      expand_inline_opts.native_options.override_device = true;
 
-      bool was_mutated = ExpandInlineFunctions(runtime, g, inline_opts);
+      bool was_mutated = ExpandInlineFunctions(runtime, g, expand_inline_opts);
       if (was_mutated) {
         DumpGraph("ExpandInlineFunctions", g);
         changed = true;
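
For context, a minimal sketch (not part of this patch) of how a caller might drive the new options struct; `flr` and `graph` are placeholder names for an existing FunctionLibraryRuntime* and Graph*:

    // Sketch only: inline function calls, propagating the caller device and
    // deriving output control edges from each function's control returns.
    ExpandInlineFunctionsOptions opts;
    opts.native_options.override_device = true;
    opts.native_options.output_control_src =
        InlineFunctionBodyOptions::OutputControlSource::kControlOutputs;
    while (ExpandInlineFunctions(flr, graph, opts)) {
      // Repeat until no more calls can be inlined (handles nested functions).
    }
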
diff --git a/tensorflow/core/common_runtime/partitioning_utils.cc b/tensorflow/core/common_runtime/partitioning_utils.cc
index d51caae..d700040 100644
--- a/tensorflow/core/common_runtime/partitioning_utils.cc
+++ b/tensorflow/core/common_runtime/partitioning_utils.cc
@@ -82,12 +82,12 @@
   // in the original function.
   for (Node* node : subgraph->op_nodes()) {
     string node_type = node->type_string();
-    if (node_type == FunctionLibraryDefinition::kArgOp) {
+    if (node->IsArg()) {
       TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
       int index = static_cast<int>(attr_value->i());
       arg_indices->push_back(index);
       arg_nodes.push_back(std::make_pair(node, index));
-    } else if (node_type == FunctionLibraryDefinition::kRetOp) {
+    } else if (node->IsRetval()) {
       TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
       int index = static_cast<int>(attr_value->i());
       ret_indices->push_back(index);
diff --git a/tensorflow/core/common_runtime/partitioning_utils_test.cc b/tensorflow/core/common_runtime/partitioning_utils_test.cc
index 0d4e362..705b52a 100644
--- a/tensorflow/core/common_runtime/partitioning_utils_test.cc
+++ b/tensorflow/core/common_runtime/partitioning_utils_test.cc
@@ -68,8 +68,7 @@
     TF_ASSERT_OK(s.ToGraph(graph));
 
     if (assign_device) {
-      Placer placer(graph, &device_set_, nullptr, /* No session options */
-                    device0_);
+      Placer placer(graph, &device_set_, device0_);
       TF_ASSERT_OK(placer.Run());
     }
   }
@@ -85,8 +84,7 @@
     auto dx_retval = ops::_Retval(s2.WithOpName("retval1"), id_y, 0);
     auto dy_retval = ops::_Retval(s1.WithOpName("retval2"), id_x, 1);
     TF_ASSERT_OK(s.ToGraph(graph));
-    Placer placer(graph, &device_set_, nullptr, /* No session options */
-                  device0_);
+    Placer placer(graph, &device_set_, device0_);
     TF_ASSERT_OK(placer.Run());
   }
 
@@ -100,8 +98,7 @@
     auto id_x = ops::Identity(s1.WithOpName("id_x"), x);
     auto dx_retval = ops::_Retval(s1.WithOpName("retval1"), id_x, ret_index);
     TF_ASSERT_OK(s.ToGraph(subgraph));
-    Placer placer(subgraph, &device_set_, nullptr, /* No session options */
-                  device0_);
+    Placer placer(subgraph, &device_set_, device0_);
     TF_ASSERT_OK(placer.Run());
   }
 
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index b2f4f1a..2ea1b6d 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -79,16 +79,20 @@
 }  // namespace
 
 Placer::Placer(Graph* graph, const DeviceSet* devices,
-               const SessionOptions* options, const Device* default_device)
+               const Device* default_device, bool allow_soft_placement,
+               bool log_device_placement)
     : graph_(graph),
       devices_(devices),
-      options_(options),
-      log_device_placement_(options != nullptr &&
-                            options->config.log_device_placement()),
-      default_device_(default_device) {}
+      default_device_(default_device),
+      allow_soft_placement_(allow_soft_placement),
+      log_device_placement_(log_device_placement) {}
+
+Placer::Placer(Graph* graph, const DeviceSet* devices,
+               const Device* default_device)
+    : Placer(graph, devices, default_device, true, false) {}
 
 Placer::Placer(Graph* graph, const DeviceSet* devices)
-    : Placer(graph, devices, nullptr, nullptr) {}
+    : Placer(graph, devices, nullptr, true, false) {}
 
 Placer::~Placer() {}
 
@@ -106,10 +110,9 @@
     }
   }
 
-  ColocationGraph colocation_graph(
-      graph_, devices_, default_device_,
-      options_ == nullptr || options_->config.allow_soft_placement(),
-      log_device_placement_);
+  ColocationGraph colocation_graph(graph_, devices_, default_device_,
+                                   allow_soft_placement_,
+                                   log_device_placement_);
 
   TF_RETURN_IF_ERROR(colocation_graph.Initialize());
 
diff --git a/tensorflow/core/common_runtime/placer.h b/tensorflow/core/common_runtime/placer.h
index e6c5a89..3bb5033 100644
--- a/tensorflow/core/common_runtime/placer.h
+++ b/tensorflow/core/common_runtime/placer.h
@@ -68,8 +68,10 @@
   //
   // The "graph", "devices", and "default_device" pointer arguments are borrowed
   // by this Placer, and must outlive it.
-  Placer(Graph* graph, const DeviceSet* devices, const SessionOptions* options,
-         const Device* default_device);
+  Placer(Graph* graph, const DeviceSet* devices, const Device* default_device,
+         bool allow_soft_placement, bool log_device_placement);
+
+  Placer(Graph* graph, const DeviceSet* devices, const Device* default_device);
 
   Placer(Graph* graph, const DeviceSet* devices);
 
@@ -90,9 +92,9 @@
 
   Graph* const graph_;              // Not owned.
   const DeviceSet* const devices_;  // Not owned.
-  const SessionOptions* options_;   // Not owned.
+  const Device* default_device_;    // Not owned.
+  const bool allow_soft_placement_;
   const bool log_device_placement_;
-  const Device* default_device_;  // Not owned.
 
   TF_DISALLOW_COPY_AND_ASSIGN(Placer);
 };
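
A minimal sketch of the two call patterns this change enables, assuming an existing Graph* graph, DeviceSet* devices, an optional SessionOptions* session_options, and a Device* default_device (all names illustrative, not part of this patch):

    // Explicit flags, e.g. derived from session options when they exist.
    Placer placer(graph, devices, /*default_device=*/nullptr,
                  session_options == nullptr ||
                      session_options->config.allow_soft_placement(),
                  session_options != nullptr &&
                      session_options->config.log_device_placement());
    TF_RETURN_IF_ERROR(placer.Run());

    // Defaults (soft placement on, no placement logging) plus a default device.
    Placer simple_placer(graph, devices, default_device);
    TF_RETURN_IF_ERROR(simple_placer.Run());
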
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index 53848c6..6c056f8 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -239,20 +239,23 @@
   // placement will use the default DeviceSet (of 10 CPU and 10 GPU devices).
   //
   // REQUIRES: "*graph" was produced by the most recent call to BuildGraph.
-  Status Place(Graph* graph, DeviceSet* devices, SessionOptions* options) {
-    Placer placer(graph, devices, options, nullptr);
+  Status Place(Graph* graph, DeviceSet* devices, bool allow_soft_placement,
+               bool log_device_placement) {
+    Placer placer(graph, devices, nullptr, allow_soft_placement,
+                  log_device_placement);
     return placer.Run();
   }
 
   Status Place(Graph* graph, DeviceSet* devices) {
-    return Place(graph, devices, nullptr);
+    return Place(graph, devices, true, false);
   }
 
-  Status Place(Graph* graph, SessionOptions* options) {
-    return Place(graph, &devices_, options);
+  Status Place(Graph* graph, bool allow_soft_placement,
+               bool log_device_placement) {
+    return Place(graph, &devices_, allow_soft_placement, log_device_placement);
   }
 
-  Status Place(Graph* graph) { return Place(graph, &devices_, nullptr); }
+  Status Place(Graph* graph) { return Place(graph, &devices_, true, false); }
 
   // Returns the node in "graph" with the given name.
   //
@@ -752,9 +755,7 @@
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  options.config.set_allow_soft_placement(true);
-  TF_EXPECT_OK(Place(&g, &options));
+  TF_EXPECT_OK(Place(&g, true, false));
   EXPECT_DEVICE_TYPE(g, "in", "FakeCPU");
   EXPECT_DEVICE_CONTAINS(g, "in", "/device:fakecpu");
   EXPECT_DEVICE_TYPE(g, "var", "FakeGPU");
@@ -928,10 +929,7 @@
       }
     }
 
-    SessionOptions options;
-    options.config.set_allow_soft_placement(allow_soft_placement);
-    options.config.set_log_device_placement(true);
-    Status s = Place(&g, &options);
+    Status s = Place(&g, allow_soft_placement, true);
     EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
     EXPECT_TRUE(str_util::StrContains(
         s.error_message(),
@@ -973,8 +971,7 @@
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  s = Place(&g, &options);
+  s = Place(&g, false, false);
   TF_EXPECT_OK(s);
   EXPECT_DEVICE_TYPE(g, "var_0", "FakeGPU");
   EXPECT_DEVICE_TYPE(g, "assign", "FakeGPU");
@@ -1005,8 +1002,7 @@
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  s = Place(&g, &options);
+  s = Place(&g, false, false);
   TF_EXPECT_OK(s);
   EXPECT_DEVICE_TYPE(g, "var_0", "FakeCPU");
   EXPECT_DEVICE_TYPE(g, "assign", "FakeCPU");
@@ -1031,8 +1027,7 @@
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  s = Place(&g, &options);
+  s = Place(&g, false, false);
   TF_EXPECT_OK(s);
   EXPECT_DEVICE_TYPE(g, "var_0", "FakeCPU");
   EXPECT_DEVICE_TYPE(g, "assign", "FakeCPU");
@@ -1101,10 +1096,7 @@
   }
 
   bool allow_soft_placement = GetParam();
-  SessionOptions options;
-  options.config.set_allow_soft_placement(allow_soft_placement);
-  options.config.set_log_device_placement(true);
-  Status s = Place(&g, &options);
+  Status s = Place(&g, allow_soft_placement, true);
   if (allow_soft_placement) {
     EXPECT_EQ(error::OK, s.code()) << s.ToString();
     EXPECT_DEVICE_TYPE(g, "in", "FakeCPU");
@@ -1179,10 +1171,7 @@
   }
 
   bool allow_soft_placement = GetParam();
-  SessionOptions options;
-  options.config.set_allow_soft_placement(allow_soft_placement);
-  options.config.set_log_device_placement(true);
-  Status s = Place(&g, &options);
+  Status s = Place(&g, allow_soft_placement, true);
   if (allow_soft_placement) {
     EXPECT_EQ(error::OK, s.code()) << s.ToString();
   } else {
@@ -1444,9 +1433,7 @@
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  options.config.set_allow_soft_placement(true);
-  TF_EXPECT_OK(Place(&g, &options));
+  TF_EXPECT_OK(Place(&g, true, false));
   EXPECT_DEVICE_CONTAINS(g, "in", "/device:fakegpu:0");
 }
 
@@ -1461,8 +1448,7 @@
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  Status s = Place(&g, &options);
+  Status s = Place(&g, false, false);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   EXPECT_TRUE(str_util::StrContains(s.error_message(), "/device:fakegpu:11"));
 }
@@ -1478,8 +1464,7 @@
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  Status s = Place(&g, &options);
+  Status s = Place(&g, false, false);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
   EXPECT_TRUE(str_util::StrContains(s.error_message(),
@@ -1498,8 +1483,7 @@
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  Status s = Place(&g, &options);
+  Status s = Place(&g, false, false);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   EXPECT_TRUE(str_util::StrContains(s.error_message(), "/device:fakecpu:0"));
   EXPECT_TRUE(str_util::StrContains(
@@ -1518,8 +1502,7 @@
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  Status s = Place(&g, &options);
+  Status s = Place(&g, false, false);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
   EXPECT_TRUE(str_util::StrContains(
@@ -1540,8 +1523,7 @@
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  Status s = Place(&g, &options);
+  Status s = Place(&g, false, false);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
   EXPECT_TRUE(str_util::StrContains(
@@ -1559,9 +1541,7 @@
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  options.config.set_allow_soft_placement(true);
-  TF_EXPECT_OK(Place(&g, &options));
+  TF_EXPECT_OK(Place(&g, true, false));
 }
 
 // Test that a graph with device type and reference constraints on
@@ -1588,9 +1568,7 @@
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  SessionOptions options;
-  options.config.set_allow_soft_placement(true);
-  TF_EXPECT_OK(Place(&g, &options));
+  TF_EXPECT_OK(Place(&g, true, false));
   EXPECT_DEVICE_TYPE(g, "var_gpu", "FakeGPU");
   EXPECT_DEVICE_TYPE(g, "force_gpu", "FakeGPU");
   EXPECT_COLOCATED(g, "var_gpu", "force_gpu");
@@ -1729,10 +1707,7 @@
   TF_ASSERT_OK(BuildGraph(graph, &g));
 
   bool allow_soft_placement = GetParam();
-  SessionOptions options;
-  options.config.set_allow_soft_placement(allow_soft_placement);
-  options.config.set_log_device_placement(true);
-  Status s = Place(&g, &options);
+  Status s = Place(&g, allow_soft_placement, true);
   if (allow_soft_placement) {
     EXPECT_EQ(error::OK, s.code()) << s.ToString();
     EXPECT_DEVICE_TYPE(g, "a", "FakeGPU");
@@ -1821,9 +1796,7 @@
 
   bool allow_soft_placement = GetParam();
 
-  SessionOptions options;
-  options.config.set_allow_soft_placement(allow_soft_placement);
-  Status s = Place(&g, &options);
+  Status s = Place(&g, allow_soft_placement, false);
   if (allow_soft_placement) {
     EXPECT_EQ(error::OK, s.code()) << s.ToString();
     EXPECT_DEVICE_TYPE(g, "a", "FakeGPU");
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index 020bb60..36741a9 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -61,7 +61,8 @@
     const OptimizerOptions& optimizer_options,
     thread::ThreadPool* default_thread_pool,
     DistributedFunctionLibraryRuntime* parent)
-    : device_mgr_(device_mgr),
+    : env_(env),
+      device_mgr_(device_mgr),
       lib_def_(lib_def),
       default_thread_pool_(default_thread_pool),
       next_handle_(0),
@@ -86,7 +87,8 @@
     CustomKernelCreator custom_kernel_creator,
     thread::ThreadPool* default_thread_pool,
     DistributedFunctionLibraryRuntime* parent)
-    : device_mgr_(device_mgr),
+    : env_(env),
+      device_mgr_(device_mgr),
       lib_def_(lib_def),
       default_thread_pool_(default_thread_pool),
       next_handle_(0),
@@ -301,7 +303,7 @@
   // arguments. To make sure that the output producing nodes have assigned
   // devices, we assign them to arguments first.
   for (Node* node : graph->op_nodes()) {
-    if (node->type_string() == FunctionLibraryDefinition::kArgOp) {
+    if (node->IsArg()) {
       const AttrValue* attr_value;
       TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
       int64 index = attr_value->i();
@@ -310,7 +312,7 @@
   }
 
   for (Node* node : graph->op_nodes()) {
-    if (node->type_string() == FunctionLibraryDefinition::kRetOp) {
+    if (node->IsRetval()) {
       if (output_devices.empty()) {
         // If output_devices are empty, the node producing retval
         // must have explicitly assigned device or a colocation constraint
@@ -512,7 +514,17 @@
   }
 
   VLOG(1) << "Instantiating MultiDevice function \"" << function_name
-          << "\" on default device " << options.target;
+          << "\" on default device \"" << options.target << "\"";
+  if (VLOG_IS_ON(3)) {
+    VLOG(3) << "Requested input devices:";
+    for (const string& device : options.input_devices) {
+      VLOG(3) << "    " << device;
+    }
+    VLOG(3) << "Requested output devices:";
+    for (const string& device : options.output_devices) {
+      VLOG(3) << "    " << device;
+    }
+  }
 
   const FunctionLibraryDefinition* lib_def =
       options.overlay_lib == nullptr ? lib_def_ : options.overlay_lib;
@@ -548,17 +560,6 @@
   TF_RETURN_IF_ERROR(PinArgsAndRets(
       options.input_devices, options.output_devices, device_set, graph.get()));
 
-  // Make the FunctionLibraryRuntime's device the default device if
-  // nothing else is hard coded. This allows the same function definition
-  // to be specialized to different devices depending on the
-  // PartitionedCallOp's device.
-  FunctionLibraryRuntime* flr = GetFLR(options.target);
-  if (flr == nullptr) {
-    return errors::InvalidArgument(
-        "Cannot instantiate multi-device function with target device ",
-        options.target);
-  }
-
   std::unique_ptr<MultiDeviceFunctionData> data =
       MakeUnique<MultiDeviceFunctionData>(function_name, function_key,
                                           ret_node_names.size(),
@@ -567,7 +568,7 @@
   GraphOptimizationPassOptions optimization_options;
   // TODO(iga): Thread other relevant options from SessionOptions.
   SessionOptions session_options;
-  session_options.env = flr->env();
+  session_options.env = env_;
   session_options.config = options.config_proto;
   optimization_options.session_options = &session_options;
   optimization_options.graph = &graph;
@@ -579,10 +580,26 @@
       OptimizationPassRegistry::PRE_PLACEMENT, optimization_options));
 
   DumpGraph("Before calling Placer", graph.get());
+  // Make the FunctionLibraryRuntime's device the default device if
+  // nothing else is hard coded. This allows the same function definition
+  // to be specialized to different devices depending on the
+  // PartitionedCallOp's device.
+  Device* default_device = nullptr;
+  if (!options.target.empty()) {
+    FunctionLibraryRuntime* flr = GetFLR(options.target);
+    if (flr == nullptr) {
+      return errors::InvalidArgument(
+          "Cannot instantiate multi-device function with target device ",
+          options.target);
+    }
+    default_device = flr->device();
+  }
+
   // TODO(b/124993244): Smartly merge options in nested defuns, and raise
   // exceptions/warnings in case where nested function call options are ignored.
-  Placer placer(graph.get(), &device_set, &session_options,
-                flr->device() /* Default device */);
+  Placer placer(graph.get(), &device_set, default_device,
+                options.config_proto.allow_soft_placement(),
+                options.config_proto.log_device_placement());
   TF_RETURN_IF_ERROR(placer.Run());
 
   DumpGraph("Before running POST_PLACEMENT passes", graph.get());
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index a08e845..14f3635 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -137,6 +137,8 @@
            std::vector<Tensor>* rets,
            FunctionLibraryRuntime::DoneCallback done) const;
 
+  const DeviceMgr* device_mgr() { return device_mgr_; }
+
  private:
   friend class FunctionLibraryRuntimeImpl;
 
@@ -285,6 +287,7 @@
 
   mutable mutex mu_;
 
+  Env* const env_;
   const DeviceMgr* const device_mgr_;
   const FunctionLibraryDefinition* lib_def_;
   thread::ThreadPool* default_thread_pool_;
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index bf00ed9..389ab39 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -680,7 +680,7 @@
 Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
                            GetFunctionSignature get_function,
                            InstantiationResult* result) {
-  VLOG(3) << "Instantiation Function: " << Print(fdef);
+  VLOG(4) << "Instantiation Function: " << Print(fdef);
 
   const OpDef& sig = fdef.signature();
   TF_RETURN_IF_ERROR(ValidateSignatureWithAttrs(sig, attr_values));
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index cfefa08..f6b49ca 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -87,6 +87,13 @@
         {"FakeParam", NC_FAKE_PARAM},
         {"PartitionedCall", NC_PARTITIONED_CALL},
         {"StatefulPartitionedCall", NC_PARTITIONED_CALL},
+        // Not using the constants defined in FunctionLibraryDefinition for the
+        // four ops below because the Android inference library does not link
+        // the tf.function-related files.
+        {"_Arg", NC_ARG},
+        {"_DeviceArg", NC_ARG},
+        {"_Retval", NC_RETVAL},
+        {"_DeviceRetval", NC_RETVAL},
     });
 
 #undef REF_CLASS
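
These classifications back the Node::IsArg()/IsRetval() predicates declared in graph.h just below; a rough sketch of the call-site pattern this PR migrates to (assuming an existing Graph* graph):

    for (Node* node : graph->op_nodes()) {
      if (node->IsArg()) {
        // Function input (_Arg or _DeviceArg); read its "index" attr, etc.
      } else if (node->IsRetval()) {
        // Function output (_Retval or _DeviceRetval).
      }
    }
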
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 8fa3d81..c463ece 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -174,6 +174,10 @@
   bool IsMetadata() const { return class_ == NC_METADATA; }
   bool IsFakeParam() const { return class_ == NC_FAKE_PARAM; }
   bool IsPartitionedCall() const { return class_ == NC_PARTITIONED_CALL; }
+  // Is this node a function input (an _Arg op)?
+  bool IsArg() const { return class_ == NC_ARG; }
+  // Is this node a function output (a _Retval op)?
+  bool IsRetval() const { return class_ == NC_RETVAL; }
 
   template <typename T>
   void AddAttr(const string& name, const T& val) {
@@ -256,6 +260,8 @@
     NC_COLLECTIVE,
     NC_FAKE_PARAM,
     NC_PARTITIONED_CALL,
+    NC_ARG,
+    NC_RETVAL,
     NC_OTHER  // Not a special kind of node
   };
 
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 35ca93d..84d813f 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -165,6 +165,7 @@
 
 cc_library(
     name = "cost_estimator",
+    srcs = ["cost_estimator.cc"],
     hdrs = ["cost_estimator.h"],
     visibility = ["//visibility:public"],
     deps = [
@@ -173,6 +174,16 @@
     ],
 )
 
+tf_cc_test(
+    name = "cost_estimator_test",
+    srcs = ["cost_estimator_test.cc"],
+    deps = [
+        ":cost_estimator",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "virtual_placer",
     srcs = ["virtual_placer.cc"],
diff --git a/tensorflow/core/grappler/costs/cost_estimator.cc b/tensorflow/core/grappler/costs/cost_estimator.cc
new file mode 100644
index 0000000..0fc4e99
--- /dev/null
+++ b/tensorflow/core/grappler/costs/cost_estimator.cc
@@ -0,0 +1,81 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+
+namespace tensorflow {
+namespace grappler {
+
+Costs CombineCosts(const Costs& left, const Costs& right) {
+  CHECK_NE(left.max_memory, kMemoryUnknown);
+  CHECK_NE(left.max_per_op_buffers, kMemoryUnknown);
+  CHECK_NE(left.max_per_op_streaming, kMemoryUnknown);
+
+  Costs result = left;
+  result.execution_time += right.execution_time;
+  result.compute_time += right.compute_time;
+  result.memory_time += right.memory_time;
+  result.intermediate_memory_time += right.intermediate_memory_time;
+  result.intermediate_memory_read_time += right.intermediate_memory_read_time;
+  result.intermediate_memory_write_time += right.intermediate_memory_write_time;
+
+  if (right.max_per_op_buffers != kMemoryUnknown) {
+    result.max_per_op_buffers =
+        std::max(left.max_per_op_buffers, right.max_per_op_buffers);
+  }
+  if (right.max_per_op_streaming != kMemoryUnknown) {
+    result.max_per_op_streaming =
+        std::max(left.max_per_op_streaming, right.max_per_op_streaming);
+  }
+
+  result.num_ops_total += right.num_ops_total;
+  if (right.inaccurate) {
+    result.inaccurate = true;
+  }
+  result.num_ops_with_unknown_shapes += right.num_ops_with_unknown_shapes;
+  if (right.max_memory != kMemoryUnknown) {
+    result.max_memory += right.max_memory;
+  }
+
+  return result;
+}
+
+// Multiplies Costs by a scalar.
+// Equivalent to applying CombineCosts "multiplier" times.
+// Note that the num_ops fields are not multiplied.
+Costs MultiplyCosts(const Costs& costs, int multiplier) {
+  CHECK_GE(multiplier, 0);
+  if (multiplier == 0) {
+    return Costs::ZeroCosts();
+  }
+  if (multiplier == 1) {
+    return costs;
+  }
+
+  Costs result = costs;
+  result.execution_time *= multiplier;
+  result.compute_time *= multiplier;
+  result.memory_time *= multiplier;
+  result.intermediate_memory_time *= multiplier;
+  result.intermediate_memory_read_time *= multiplier;
+  result.intermediate_memory_write_time *= multiplier;
+  if (result.max_memory != kMemoryUnknown) {
+    result.max_memory *= multiplier;
+  }
+  return result;
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
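
A minimal usage sketch of the two helpers (not part of this patch), mirroring how the virtual scheduler folds an annotated node's cost into the running graph cost; `node_costs`, `execution_count`, and `graph_costs` are illustrative locals:

    // Scale one node's cost by how many times the node runs, then fold the
    // result into the graph-level total.
    Costs total_node_costs = MultiplyCosts(node_costs, execution_count);
    graph_costs = CombineCosts(graph_costs, total_node_costs);
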
diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index 5876d6f..9815d3d 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -16,9 +16,6 @@
 #ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_COST_ESTIMATOR_H_
 #define TENSORFLOW_CORE_GRAPPLER_COSTS_COST_ESTIMATOR_H_
 
-#include <chrono>
-#include <cmath>
-#include <unordered_map>
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
@@ -204,6 +201,12 @@
   return costs;
 }
 
+Costs CombineCosts(const Costs& left, const Costs& right);
+
+// Multiplies Costs by a scalar.
+// Equivalent to applying CombineCosts "multiplier" times.
+Costs MultiplyCosts(const Costs& costs, int multiplier);
+
 // Given a GrapplerItem and an optimized implementation of the corresponding
 // TensorFlow graph, the CostEstimator attempts to predict the actual cost of
 // running the graph.
diff --git a/tensorflow/core/grappler/costs/cost_estimator_test.cc b/tensorflow/core/grappler/costs/cost_estimator_test.cc
new file mode 100644
index 0000000..62197a4
--- /dev/null
+++ b/tensorflow/core/grappler/costs/cost_estimator_test.cc
@@ -0,0 +1,88 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+TEST(CostEstimatorTest, CombineCosts) {
+  Costs c = Costs::ZeroCosts();
+  c.execution_time = Costs::NanoSeconds(1);
+  c.compute_time = Costs::NanoSeconds(2);
+  c.memory_time = Costs::NanoSeconds(3);
+  c.intermediate_memory_time = Costs::NanoSeconds(4);
+  c.intermediate_memory_read_time = Costs::NanoSeconds(5);
+  c.intermediate_memory_write_time = Costs::NanoSeconds(6);
+  c.max_memory = 1;
+  c.max_per_op_buffers = 2;
+  c.max_per_op_streaming = 3;
+  c.num_ops_total = 1;
+  c.inaccurate = false;
+  c.num_ops_with_unknown_shapes = 0;
+
+  Costs sum = CombineCosts(c, c);
+
+  EXPECT_EQ(sum.execution_time, Costs::NanoSeconds(2));
+  EXPECT_EQ(sum.compute_time, Costs::NanoSeconds(4));
+  EXPECT_EQ(sum.memory_time, Costs::NanoSeconds(6));
+  EXPECT_EQ(sum.intermediate_memory_time, Costs::NanoSeconds(8));
+  EXPECT_EQ(sum.intermediate_memory_read_time, Costs::NanoSeconds(10));
+  EXPECT_EQ(sum.intermediate_memory_write_time, Costs::NanoSeconds(12));
+  EXPECT_EQ(sum.max_memory, 2);
+  EXPECT_EQ(sum.max_per_op_buffers, 2);
+  EXPECT_EQ(sum.max_per_op_streaming, 3);
+  EXPECT_EQ(sum.num_ops_total, 2);
+  EXPECT_FALSE(sum.inaccurate);
+  EXPECT_EQ(sum.num_ops_with_unknown_shapes, 0);
+}
+
+TEST(CostEstimatorTest, MultiplyCosts) {
+  Costs c = Costs::ZeroCosts();
+  c.execution_time = Costs::NanoSeconds(1);
+  c.compute_time = Costs::NanoSeconds(2);
+  c.memory_time = Costs::NanoSeconds(3);
+  c.intermediate_memory_time = Costs::NanoSeconds(4);
+  c.intermediate_memory_read_time = Costs::NanoSeconds(5);
+  c.intermediate_memory_write_time = Costs::NanoSeconds(6);
+  c.max_memory = 1;
+  c.max_per_op_buffers = 2;
+  c.max_per_op_streaming = 3;
+  c.num_ops_total = 1;
+  c.inaccurate = false;
+  c.num_ops_with_unknown_shapes = 0;
+
+  Costs product = MultiplyCosts(c, 10);
+
+  EXPECT_EQ(product.execution_time, Costs::NanoSeconds(10));
+  EXPECT_EQ(product.compute_time, Costs::NanoSeconds(20));
+  EXPECT_EQ(product.memory_time, Costs::NanoSeconds(30));
+  EXPECT_EQ(product.intermediate_memory_time, Costs::NanoSeconds(40));
+  EXPECT_EQ(product.intermediate_memory_read_time, Costs::NanoSeconds(50));
+  EXPECT_EQ(product.intermediate_memory_write_time, Costs::NanoSeconds(60));
+  EXPECT_EQ(product.max_memory, 10);
+  EXPECT_EQ(product.max_per_op_buffers, 2);
+  EXPECT_EQ(product.max_per_op_streaming, 3);
+  EXPECT_EQ(product.num_ops_total, 1);
+  EXPECT_FALSE(product.inaccurate);
+  EXPECT_EQ(product.num_ops_with_unknown_shapes, 0);
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index d549246..52c8f6f 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -34,41 +34,9 @@
 
 namespace tensorflow {
 namespace grappler {
+
 namespace {
 
-Costs CombineCosts(const Costs& left, const Costs& right) {
-  CHECK_NE(left.max_memory, kMemoryUnknown);
-  CHECK_NE(left.max_per_op_buffers, kMemoryUnknown);
-  CHECK_NE(left.max_per_op_streaming, kMemoryUnknown);
-
-  Costs result = left;
-  result.execution_time += right.execution_time;
-  result.compute_time += right.compute_time;
-  result.memory_time += right.memory_time;
-  result.intermediate_memory_time += right.intermediate_memory_time;
-
-  result.num_ops_total += right.num_ops_total;
-  if (right.inaccurate) result.inaccurate = true;
-  result.num_ops_with_unknown_shapes += right.num_ops_with_unknown_shapes;
-
-  if (right.max_memory != kMemoryUnknown) {
-    result.max_memory += right.max_memory;
-  }
-  if (right.max_per_op_buffers != kMemoryUnknown) {
-    result.max_per_op_buffers =
-        std::max(left.max_per_op_buffers, right.max_per_op_buffers);
-  }
-  if (right.max_per_op_streaming != kMemoryUnknown) {
-    result.max_per_op_streaming =
-        std::max(left.max_per_op_streaming, right.max_per_op_streaming);
-  }
-  VLOG(4) << "costs execution_time=" << result.execution_time.count()
-          << " max_memory=" << result.max_memory
-          << " max_per_op_buffers=" << result.max_per_op_buffers
-          << " max_per_op_streaming=" << result.max_per_op_streaming;
-  return result;
-}
-
 // Key to the cached _Recv ops map, and its hash and predicate structures.
 struct RecvNodeDescriptor {
   const NodeDef* node;
@@ -365,8 +333,7 @@
     name_to_node[node->name()] = node;
   }
 
-  // Traverse the graph to check if the graph is annotated with Switch outputs.
-  // Also record _Send nodes.
+  // Traverses the graph to record _Send nodes.
   // TODO(dyoon): Instead of identifying _Send node here manually, add _Send
   // to _Recv as control dependency when creating GrapplerItem.
   std::unordered_map<string, const NodeDef*> name_to_send;
@@ -375,11 +342,6 @@
       const auto& attr = node.attr();
       name_to_send[attr.at("tensor_name").s()] = &node;
     }
-
-    if (IsSwitch(node)) {
-      const auto& attr = node.attr();
-      if (attr.count(kOutputSlots) > 0) switch_outputs_annotated_ = true;
-    }
   }
 
   // To reuse _Recv ops.
@@ -741,66 +703,29 @@
   return it->second;
 }
 
-// Check Switch outputs in updated MetaGraphDef, add corresponding nodes to
-// ready queue.
-// Fallback to add all outputs if fail to find the actual output.
-bool VirtualScheduler::AddSwitchOutputsToReadyQueue(
-    const NodeDef* node, int curr_iter, const Costs::Duration& curr_time) {
-  if (node->attr().count(kOutputSlots) == 0) return false;
-
-  auto& node_state = node_map_[node];
-  const auto& slot_vector = node->attr().at(kOutputSlots);
-  if (slot_vector.list().i_size() <= curr_iter) {
-    // Sometimes we encounter infinite loop. Fall back to add all outputs.
-    return false;
-  }
-
-  int slot = slot_vector.list().i(curr_iter);
-  for (const auto& port_num_output_pair : node_state.outputs) {
-    if (port_num_output_pair.first != slot) continue;
-
-    for (auto* output_node : port_num_output_pair.second) {
-      auto& output_state = node_map_[output_node];
-      output_state.num_inputs_ready++;
-      // Execute a node as soon as all its inputs are ready. Merge nodes
-      // are special since they run as soon as one of their inputs becomes
-      // available.
-      if (output_state.num_inputs_ready == output_state.inputs.size() ||
-          IsMerge(*output_node)) {
-        // This output node is now ready.
-        output_state.time_ready = curr_time;
-        ready_nodes_->AddNode(output_node);
-        VLOG(3) << "Node " << node->name() << " iter " << curr_iter << "/"
-                << slot_vector.list().i_size() << " Add Switch output " << slot
-                << ": " << output_node->name();
-      }
-    }
-    return true;
-  }
-
-  return false;
-}
-
 void VirtualScheduler::AddOutputNodesToReadyQueue(
     const NodeDef* node, const Costs::Duration& curr_time) {
-  auto& node_state = node_map_[node];
-  int curr_iter = node_state.num_executed_times;
-  ++node_state.num_executed_times;
-
-  if (switch_outputs_annotated_) {
-    // If the graph is annotated with StepStats, reset num_inputs_ready so we
-    // can schedule the node multiple times.
-    node_state.num_inputs_ready = 0;
-
-    // For Switch node, get output branch from updated MetaGraphDef.
-    if (IsSwitch(*node) &&
-        AddSwitchOutputsToReadyQueue(node, curr_iter, curr_time))
-      return;
+  // Checks whether the Switch's output slots change over iterations.
+  int slot = -1;
+  if (IsSwitch(*node) && node->attr().count(kOutputSlots) > 0 &&
+      node->attr().at(kOutputSlots).list().i_size() > 0) {
+    slot = node->attr().at(kOutputSlots).list().i(0);
+    for (int i = 1; i < node->attr().at(kOutputSlots).list().i_size(); ++i) {
+      if (slot != node->attr().at(kOutputSlots).list().i(i)) {
+        slot = -1;
+        break;
+      }
+    }
   }
 
   // Increment num_inputs_ready of the output nodes and maybe add to ready
   // nodes.
+  auto& node_state = node_map_[node];
   for (const auto& port_num_output_pair : node_state.outputs) {
+    // If the Switch is annotated and its output slot is always the same, only
+    // schedule the slot that was executed; otherwise, schedule both slots.
+    if (slot >= 0 && port_num_output_pair.first != slot) continue;
+
     for (auto* output_node : port_num_output_pair.second) {
       auto& output_state = node_map_[output_node];
       output_state.num_inputs_ready++;
@@ -812,6 +737,7 @@
         // This output node is now ready.
         output_state.time_ready = curr_time;
         ready_nodes_->AddNode(output_node);
+        VLOG(3) << "  Add output: " << output_node->name();
       }
     }
   }
@@ -819,12 +745,20 @@
 
 bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   // Update graph_costs_ and per-op costs.
-  graph_costs_ = CombineCosts(graph_costs_, node_costs);
   const NodeDef* node = ready_nodes_->GetCurrNode();
+  auto& node_state = node_map_[node];
+  // If the graph is annotated with an execution count, use that number;
+  // otherwise, assume the node is executed once.
+  node_state.execution_count = node->attr().count(kExecutionCount) == 0
+                                   ? 1
+                                   : node->attr().at(kExecutionCount).i();
+  Costs total_node_costs =
+      MultiplyCosts(node_costs, node_state.execution_count);
+  graph_costs_ = CombineCosts(graph_costs_, total_node_costs);
   const string& op_name = node->op();
 
   auto& op_cost = FindOrCreateZero(op_name, &op_to_cost_);
-  op_cost = CombineCosts(op_cost, node_costs);
+  op_cost = CombineCosts(op_cost, total_node_costs);
 
   if (VLOG_IS_ON(2)) {
     // Also keep track of op counts and costs per op (with their shapes).
@@ -838,21 +772,16 @@
   }
 
   // Update node and device states.
-  auto& node_state = node_map_[node];
   auto& device = device_[node_state.device_name];
   device.nodes_executed.push_back(node);
   // Node is scheduled when the device is available AND all the inputs are
   // ready; hence, time_scheduled is time_ready if time_ready > device curr
   // time.
-  // TODO(andiryxu): Current node_state result only records the last execution.
-  // With annotated MetaGraph we can schedule a node for multiple times.
-  // Refine NodeState structure accordingly, e.g. record time_scheduled in a
-  // vector.
   node_state.time_scheduled =
       std::max(device.GetCurrTime(), node_state.time_ready);
   // Override device curr time with the time_scheduled.
   device.device_costs.execution_time = node_state.time_scheduled;
-  device.device_costs = CombineCosts(device.device_costs, node_costs);
+  device.device_costs = CombineCosts(device.device_costs, total_node_costs);
   auto curr_time = device.GetCurrTime();
   node_state.time_finished = curr_time;
 
@@ -865,7 +794,8 @@
         node_state.time_no_references[port_num] = curr_time;
       } else {
         device.memory_usage +=
-            CalculateOutputSize(node_state.output_properties, port_num);
+            CalculateOutputSize(node_state.output_properties, port_num) *
+            node_state.execution_count;
         device.nodes_in_memory.insert(std::make_pair(node, port_num));
       }
     }
@@ -873,15 +803,16 @@
 
   // Update device's per-op cost.
   auto& device_op_cost = FindOrCreateZero(op_name, &device.op_to_cost);
-  device_op_cost = CombineCosts(device_op_cost, node_costs);
+  device_op_cost = CombineCosts(device_op_cost, total_node_costs);
 
   VLOG(3) << "Op scheduled -- name: " << node->name() << ", op: " << node->op()
           << ", device: " << node->device()
+          << ", execution_count: " << node_state.execution_count
           << ", ready: " << node_state.time_ready.count()
           << ", scheduled: " << node_state.time_scheduled.count()
           << ", finished: " << node_state.time_finished.count();
 
-  // Check outputs, add ready nodes to queue.
+  // Checks outputs and adds ready nodes to the queue.
   AddOutputNodesToReadyQueue(node, curr_time);
 
   // Increment num_outputs_executed of the input nodes and maybe update memory.
@@ -898,7 +829,8 @@
       input_state.time_no_references[port] = curr_time;
       auto& input_device = device_[input_state.device_name];
       input_device.memory_usage -=
-          CalculateOutputSize(input_state.output_properties, port);
+          CalculateOutputSize(input_state.output_properties, port) *
+          node_state.execution_count;
 
       input_device.nodes_in_memory.erase(std::make_pair(input, port));
     }
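
As a rough sketch (not part of this patch), annotations like the ones the scheduler now consumes could be attached to a NodeDef along these lines, using the kExecutionCount/kOutputSlots attribute-name constants referenced above; `node` is an illustrative NodeDef&:

    // Tell the scheduler this node runs ten times per step.
    AttrValue execution_count;
    execution_count.set_i(10);
    AddNodeAttr(kExecutionCount, execution_count, &node);

    // For an annotated Switch, record which output slot fired.
    AttrValue output_slots;
    output_slots.mutable_list()->add_i(0);
    AddNodeAttr(kOutputSlots, output_slots, &node);
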
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index cceca71..e8e1622 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -71,14 +71,14 @@
   // time_no_references.
 
   // How many times this node has been executed, e.g. in a while loop.
-  int num_executed_times;
+  int execution_count;
 
   NodeState() {
     num_inputs_ready = 0;
     time_ready = Costs::Duration::max();
     time_scheduled = Costs::Duration::max();
     time_finished = Costs::Duration::max();
-    num_executed_times = 0;
+    execution_count = 0;
     // Note that num_outputs_executed and time_no_references are not initialized
     // here, since we don't know the size (i.e., # outputs for this node).
   }
@@ -323,8 +323,6 @@
                           std::map<string, Costs>* op_cost);
   float Round2(const float x) const;
   bool IsPersistentNode(const NodeDef* node) const;
-  bool AddSwitchOutputsToReadyQueue(const NodeDef* node, int curr_iter,
-                                    const Costs::Duration& curr_time);
   void AddOutputNodesToReadyQueue(const NodeDef* node,
                                   const Costs::Duration& curr_time);
 
@@ -358,10 +356,6 @@
   bool track_mem_usage_snapshot_;
   const bool use_aggressive_shape_inference_;
 
-  // Whether the input graph includes Switch nodes annotated with output slots
-  // information.
-  bool switch_outputs_annotated_ = false;
-
   VirtualPlacer placer_;  // owned.
 };
 
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index 3b48263..38fd380 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -873,8 +873,8 @@
     grappler_item_->fetch = {"while/Exit", "while/Exit_1"};
   }
 
-  // A simple while loop strengthened with Switch outputs.
-  void CreateGrapplerItemWithLoopSwitchOutputs() {
+  // A simple while loop annotated with execution counts and Switch output
+  // slots.
+  void CreateGrapplerItemWithLoopAnnotated() {
     // Test graph produced in python using:
     /*
       with tf.Graph().as_default():
@@ -909,6 +909,12 @@
       }
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 1
+    }
+  }
 }
 node {
   name: "ones"
@@ -936,6 +942,12 @@
       }
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 1
+    }
+  }
 }
 node {
   name: "while/Enter"
@@ -965,6 +977,12 @@
       i: 10
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 1
+    }
+  }
 }
 node {
   name: "while/Enter_1"
@@ -994,6 +1012,12 @@
       i: 10
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 1
+    }
+  }
 }
 node {
   name: "while/Merge"
@@ -1012,6 +1036,12 @@
       type: DT_INT32
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/Merge_1"
@@ -1030,6 +1060,12 @@
       type: DT_FLOAT
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/Less/y"
@@ -1052,6 +1088,12 @@
       }
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/Less"
@@ -1064,11 +1106,23 @@
       type: DT_INT32
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/LoopCond"
   op: "LoopCond"
   input: "while/Less"
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/Switch"
@@ -1090,6 +1144,12 @@
     }
   }
   attr {
+    key: "_execution_count"
+    value {
+      i: 11
+    }
+  }
+  attr {
     key: "_output_slot_vector"
     value {
       list {
@@ -1128,6 +1188,12 @@
     }
   }
   attr {
+    key: "_execution_count"
+    value {
+      i: 11
+    }
+  }
+  attr {
     key: "_output_slot_vector"
     value {
       list {
@@ -1156,6 +1222,12 @@
       type: DT_INT32
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/Identity_1"
@@ -1167,6 +1239,12 @@
       type: DT_FLOAT
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/add/y"
@@ -1189,6 +1267,12 @@
       }
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/add"
@@ -1201,6 +1285,12 @@
       type: DT_INT32
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/concat/axis"
@@ -1223,6 +1313,12 @@
       }
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/concat"
@@ -1248,6 +1344,12 @@
       type: DT_INT32
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/NextIteration"
@@ -1259,6 +1361,12 @@
       type: DT_INT32
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/NextIteration_1"
@@ -1270,6 +1378,12 @@
       type: DT_FLOAT
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 10
+    }
+  }
 }
 node {
   name: "while/Exit"
@@ -1281,6 +1395,12 @@
       type: DT_INT32
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 1
+    }
+  }
 }
 node {
   name: "while/Exit_1"
@@ -1292,6 +1412,12 @@
       type: DT_FLOAT
     }
   }
+  attr {
+    key: "_execution_count"
+    value {
+      i: 1
+    }
+  }
 }
 versions {
   producer: 21
@@ -1305,6 +1431,115 @@
     grappler_item_->fetch = {"while/Exit", "while/Exit_1"};
   }
 
+  // A simple condition graph.
+  void CreateGrapplerItemWithCondition() {
+    // Handcrafted test graph: a/Less -> Switch -> First/Second -> Merge.
+    const string gdef_ascii = R"EOF(
+node {
+  name: "a"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 2.0
+      }
+    }
+  }
+}
+node {
+  name: "Less"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        tensor_content: "\001"
+      }
+    }
+  }
+}
+node {
+  name: "Switch"
+  op: "Switch"
+  input: "a"
+  input: "Less"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "First"
+  op: "Identity"
+  input: "Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Second"
+  op: "Identity"
+  input: "Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Merge"
+  op: "Merge"
+  input: "First"
+  input: "Second"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+versions {
+  producer: 27
+})EOF";
+
+    grappler_item_.reset(new GrapplerItem);
+    CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii,
+                                                &grappler_item_->graph));
+    grappler_item_->id = "test_graph";
+    grappler_item_->fetch = {"Merge"};
+  }
+
   // Create a FusedBatchNorm op that has multiple output ports.
   void CreateGrapplerItemWithInterDeviceTransfers() {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0);
@@ -2379,87 +2614,155 @@
   ValidateDependencyChain(start_times, {"while/Switch_1", "while/Exit_1"});
 }
 
-TEST_F(VirtualSchedulerTest, WhileLoopWithSwitchOutputs) {
-  // Init.
-  CreateGrapplerItemWithLoopSwitchOutputs();
-  InitScheduler();
+TEST_F(VirtualSchedulerTest, AnnotatedWhileLoop) {
+  {
+    // Init.
+    CreateGrapplerItemWithLoop();
+    InitScheduler();
 
-  // Runs the scheduler.
-  RunScheduler("");
+    // Runs the scheduler.
+    RunScheduler("");
+    Costs c = scheduler_->Summary();
 
-  RunMetadata metadata;
-  scheduler_->Summary(&metadata);
-
-  // Nodes in topological order:
-  // * const, ones
-  // * while/Enter, while/Enter_1
-  // * while/Merge, while/Merge_1
-  // * while/Less/y
-  // * while/Less
-  // * while/LoopCond
-  // * while/Switch, while/Switch_1
-  // * while/Identity, while/Identity_1, while/Exit, while/Exit_1
-  // * while/add/y, while/concat/axis
-  // * while/add, while/concat
-  // * while/NextIteration, while/NextIteration_1
-
-  int num_next_iteration = 0;
-  int num_next_iteration_1 = 0;
-  int num_exit = 0;
-  int num_exit_1 = 0;
-  int64 next_iter_start_micro;
-  int64 next_iter_1_start_micro;
-  int64 exit_start_micro;
-  int64 exit_1_start_micro;
-
-  std::unordered_map<string, int64> start_times;
-  for (const auto& device_step_stats : metadata.step_stats().dev_stats()) {
-    for (const auto& stats : device_step_stats.node_stats()) {
-      start_times[stats.node_name()] = stats.all_start_micros();
-      if (stats.node_name() == "while/NextIteration") {
-        ++num_next_iteration;
-        next_iter_start_micro = stats.all_start_micros();
-      } else if (stats.node_name() == "while/NextIteration_1") {
-        ++num_next_iteration_1;
-        next_iter_1_start_micro = stats.all_start_micros();
-      } else if (stats.node_name() == "while/Exit") {
-        ++num_exit;
-        exit_start_micro = stats.all_start_micros();
-      } else if (stats.node_name() == "while/Exit_1") {
-        ++num_exit_1;
-        exit_1_start_micro = stats.all_start_micros();
-      }
-    }
+    EXPECT_EQ(23, c.execution_time.asMicroSeconds().count());
+    // Both while/Merge and while/Merge_1 are scheduled twice.
+    EXPECT_EQ(grappler_item_->graph.node_size() + 2, c.num_ops_total);
+    EXPECT_FALSE(c.inaccurate);
+    EXPECT_EQ(0, c.num_ops_with_unknown_shapes);
   }
 
-  // Makes sure we run the loop body for ten times.
-  EXPECT_EQ(10, num_next_iteration);
-  EXPECT_EQ(10, num_next_iteration_1);
-  EXPECT_EQ(1, num_exit);
-  EXPECT_EQ(1, num_exit_1);
+  {
+    // Init.
+    CreateGrapplerItemWithLoopAnnotated();
+    InitScheduler();
 
-  // Start times of while/NextIteration and while/NextIteration_1 should be
-  // different, so should be those of while/Exit and while/Exit_1.
-  EXPECT_NE(next_iter_start_micro, next_iter_1_start_micro);
-  EXPECT_NE(exit_start_micro, exit_1_start_micro);
+    // Runs the scheduler.
+    RunScheduler("");
+    Costs c = scheduler_->Summary();
 
-  // Checks dependency among the nodes; no matter what scheduling mechanism we
-  // use, the scheduled ops should follow these dependency chains.
-  // We have to break the loop into two parts, identified by Switch outputs.
-  ValidateDependencyChain(
-      start_times,
-      {"Const", "while/Enter", "while/Merge", "while/Less/y", "while/Less",
-       "while/LoopCond", "while/Switch", "while/Exit"});
-  ValidateDependencyChain(start_times, {"while/Identity", "while/add/y",
-                                        "while/add", "while/NextIteration"});
-  ValidateDependencyChain(
-      start_times, {"ones", "while/Enter_1", "while/Merge_1", "while/Switch_1",
-                    "while/Exit_1"});
-  ValidateDependencyChain(start_times, {"while/Identity_1", "while/concat",
-                                        "while/NextIteration_1"});
-  ValidateDependencyChain(
-      start_times, {"while/Identity", "while/concat/axis", "while/concat"});
-  ValidateDependencyChain(start_times, {"while/Identity", "while/add"});
+    // The cost for Merge is accumulated twice (it is scheduled twice, each
+    // time scaled by execution_count), but since Merge's cost is minimal, we
+    // keep this behavior here.
+    EXPECT_EQ(178, c.execution_time.asMicroSeconds().count());
+    // Both while/Merge and while/Merge_1 are scheduled twice.
+    EXPECT_EQ(grappler_item_->graph.node_size() + 2, c.num_ops_total);
+    EXPECT_FALSE(c.inaccurate);
+    EXPECT_EQ(0, c.num_ops_with_unknown_shapes);
+  }
+}
+
+TEST_F(VirtualSchedulerTest, Condition) {
+  // Without annotation.
+  {
+    // Inits.
+    CreateGrapplerItemWithCondition();
+    InitScheduler();
+
+    // Runs the scheduler.
+    RunScheduler("");
+    RunMetadata metadata;
+    Costs c = scheduler_->Summary(&metadata);
+
+    // Nodes in topological order: a/Less, Switch, First/Second, Merge.
+    int num_a = 0;
+    int num_less = 0;
+    int num_switch = 0;
+    int num_first = 0;
+    int num_second = 0;
+    int num_merge = 0;
+
+    for (const auto& device_step_stats : metadata.step_stats().dev_stats()) {
+      for (const auto& stats : device_step_stats.node_stats()) {
+        if (stats.node_name() == "a") {
+          ++num_a;
+        } else if (stats.node_name() == "Less") {
+          ++num_less;
+        } else if (stats.node_name() == "Switch") {
+          ++num_switch;
+        } else if (stats.node_name() == "First") {
+          ++num_first;
+        } else if (stats.node_name() == "Second") {
+          ++num_second;
+        } else if (stats.node_name() == "Merge") {
+          ++num_merge;
+        }
+      }
+    }
+
+    EXPECT_EQ(1, num_a);
+    EXPECT_EQ(1, num_less);
+    EXPECT_EQ(1, num_switch);
+    EXPECT_EQ(1, num_first);
+    EXPECT_EQ(1, num_second);
+    EXPECT_EQ(2, num_merge);
+
+    EXPECT_EQ(7, c.execution_time.asMicroSeconds().count());
+    // Merge is executed twice.
+    EXPECT_EQ(grappler_item_->graph.node_size() + 1, c.num_ops_total);
+    EXPECT_FALSE(c.inaccurate);
+    EXPECT_EQ(0, c.num_ops_with_unknown_shapes);
+  }
+
+  // With annotation.
+  {
+    // Inits.
+    CreateGrapplerItemWithCondition();
+
+    // Annotates the Switch node.
+    for (auto& node : *grappler_item_->graph.mutable_node()) {
+      if (node.name() == "Switch") {
+        AttrValue attr_output_info;
+        // Annotates only output slot 0 so that Second is not executed.
+        (*attr_output_info.mutable_list()).add_i(0);
+        AddNodeAttr(kOutputSlots, attr_output_info, &node);
+      }
+    }
+
+    InitScheduler();
+
+    // Runs the scheduler.
+    RunScheduler("");
+    RunMetadata metadata;
+    Costs c = scheduler_->Summary(&metadata);
+
+    // Nodes in topological order: a/Less, Switch, First, Merge.
+    int num_a = 0;
+    int num_less = 0;
+    int num_switch = 0;
+    int num_first = 0;
+    int num_second = 0;
+    int num_merge = 0;
+
+    for (const auto& device_step_stats : metadata.step_stats().dev_stats()) {
+      for (const auto& stats : device_step_stats.node_stats()) {
+        if (stats.node_name() == "a") {
+          ++num_a;
+        } else if (stats.node_name() == "Less") {
+          ++num_less;
+        } else if (stats.node_name() == "Switch") {
+          ++num_switch;
+        } else if (stats.node_name() == "First") {
+          ++num_first;
+        } else if (stats.node_name() == "Second") {
+          ++num_second;
+        } else if (stats.node_name() == "Merge") {
+          ++num_merge;
+        }
+      }
+    }
+
+    EXPECT_EQ(1, num_a);
+    EXPECT_EQ(1, num_less);
+    EXPECT_EQ(1, num_switch);
+    EXPECT_EQ(1, num_first);
+    EXPECT_EQ(0, num_second);
+    EXPECT_EQ(1, num_merge);
+
+    EXPECT_EQ(5, c.execution_time.asMicroSeconds().count());
+    // Second is not executed.
+    EXPECT_EQ(grappler_item_->graph.node_size() - 1, c.num_ops_total);
+    EXPECT_FALSE(c.inaccurate);
+    EXPECT_EQ(0, c.num_ops_with_unknown_shapes);
+  }
 }
 
 TEST_F(VirtualSchedulerTest, InterDeviceTransfer) {
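For readers tallying the op-count expectations asserted above, a brief recap of the arithmetic, with N = grappler_item_->graph.node_size():

  While loop: while/Merge and while/Merge_1 each scheduled twice  -> N + 2 ops
  Condition, no annotation: Merge executed twice                  -> N + 1 ops
  Condition, annotated Switch: Second skipped, Merge runs once    -> N - 1 ops
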
diff --git a/tensorflow/core/grappler/mutable_graph_view.cc b/tensorflow/core/grappler/mutable_graph_view.cc
index 6a5e60e..1200cff 100644
--- a/tensorflow/core/grappler/mutable_graph_view.cc
+++ b/tensorflow/core/grappler/mutable_graph_view.cc
@@ -639,6 +639,9 @@
   swap_names();
 
   // Swap controlling fanouts.
+  //
+  // Note: The to and from control fanout iterators are still valid, as no
+  // mutations have been performed on fanouts().
   SwapFanoutsMapValues(&fanouts(), from_control, from_control_fanouts,
                        to_control, to_control_fanouts);
 
@@ -706,6 +709,9 @@
     if (to_is_switch) {
       dedup_switch_control(from_node);
     } else {
+      // Fetch the iterator again, as the original one might have been
+      // invalidated by a container rehash triggered by mutations.
+      auto from_control_fanouts = fanouts().find(from_control);
       dedup_control_fanouts(from_node, from_control_fanouts);
     }
   }
@@ -713,6 +719,9 @@
     if (from_is_switch) {
       dedup_switch_control(to_node);
     } else {
+      // Fetch the iterator again, as the original one might have been
+      // invalidated by a container rehash triggered by mutations.
+      auto to_control_fanouts = fanouts().find(to_control);
       dedup_control_fanouts(to_node, to_control_fanouts);
     }
   }
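The iterator re-fetches above guard against a general hazard: inserting into an unordered container can trigger a rehash, which invalidates outstanding iterators. A minimal, self-contained sketch of that hazard (using std::unordered_map as a stand-in; the actual fanouts() container type may differ):

#include <string>
#include <unordered_map>
#include <vector>

int main() {
  std::unordered_map<std::string, std::vector<int>> fanouts;
  fanouts["from_control"] = {1, 2};
  auto it = fanouts.find("from_control");
  // Inserting more keys may trigger a rehash, invalidating `it`.
  for (int i = 0; i < 1000; ++i) fanouts["node_" + std::to_string(i)];
  // Re-fetch the iterator before using it, mirroring the fix above.
  it = fanouts.find("from_control");
  return it != fanouts.end() ? 0 : 1;
}
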
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index ce5a21b..5b2f1e5 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -1159,8 +1159,7 @@
   FunctionLibraryRuntime* flr = ctx->mutable_function_library_runtime();
 
   // 1. Inline symbolic gradient node.
-  const InlineFunctionBodyOptions default_inline_opts;
-  const bool expanded = ExpandInlineFunctions(flr, &graph, default_inline_opts);
+  const bool expanded = ExpandInlineFunctions(flr, &graph);
   if (!expanded) {
     return errors::Internal("Failed to expand SymbolicGradient op");
   }
@@ -1182,7 +1181,7 @@
 
   // 2. Recursively inline nested function calls.
   int iteration = 0;
-  while (ExpandInlineFunctions(flr, &graph, default_inline_opts)) {
+  while (ExpandInlineFunctions(flr, &graph)) {
     if (++iteration >= 50) {
       VLOG(2) << "Break symbolic gradient inlining loop at iteration #"
               << iteration;
@@ -1547,8 +1546,7 @@
     const Device* default_device =
         devices->FindDeviceByName(func_node.device());
 
-    Placer placer(func_body_graph.get(), devices,
-                  nullptr /* No session options */, default_device);
+    Placer placer(func_body_graph.get(), devices, default_device);
     TF_RETURN_IF_ERROR(placer.Run());
   }
 
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 6c05a39..a158f68 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -765,11 +765,19 @@
     ],
     deps = [
         ":eigen_contraction_kernel",
+        ":eigen_spatial_convolutions-inl",
         "//third_party/eigen3",
     ],
 )
 
 cc_library(
+    name = "eigen_spatial_convolutions-inl",
+    hdrs = [
+        "eigen_spatial_convolutions-inl.h",
+    ],
+)
+
+cc_library(
     name = "image_resizer_state",
     hdrs = ["image_resizer_state.h"],
     visibility = ["//visibility:private"],
@@ -5418,6 +5426,7 @@
         "eigen_pooling.h",
         "eigen_softmax.h",
         "eigen_spatial_convolutions.h",
+        "eigen_spatial_convolutions-inl.h",
         "eigen_volume_patch.h",
         "fifo_queue.h",
         "maxpooling_op.h",
diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h
index 43539ac..88e6e62 100644
--- a/tensorflow/core/kernels/batch_matmul_op_impl.h
+++ b/tensorflow/core/kernels/batch_matmul_op_impl.h
@@ -52,20 +52,15 @@
 
 namespace {
 
+// Returns the pair of dimensions along which to perform Tensor contraction to
+// emulate matrix multiplication.
+// For matrix multiplication of 2D Tensors X and Y, X is contracted along its
+// second dimension and Y along its first dimension (if neither X nor Y is
+// adjointed). The dimension to contract along switches for any operand that
+// is adjointed.
+// See http://en.wikipedia.org/wiki/Tensor_contraction
 Eigen::IndexPair<Eigen::DenseIndex> ContractionDims(bool adj_x, bool adj_y) {
-  if (!adj_x) {
-    if (!adj_y) {
-      return Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
-    } else {
-      return Eigen::IndexPair<Eigen::DenseIndex>(1, 1);
-    }
-  } else {
-    if (!adj_y) {
-      return Eigen::IndexPair<Eigen::DenseIndex>(0, 0);
-    } else {
-      return Eigen::IndexPair<Eigen::DenseIndex>(0, 1);
-    }
-  }
+  return Eigen::IndexPair<Eigen::DenseIndex>(adj_x ? 0 : 1, adj_y ? 1 : 0);
 }
 
 // Parallel batch matmul kernel based on the multi-threaded tensor contraction
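As a quick sanity check on the simplified ContractionDims above, a standalone illustrative sketch (plain std::pair standing in for Eigen::IndexPair) covering the four adjoint combinations that the removed nested ifs enumerated:

#include <cassert>
#include <utility>

// Mirrors the ternary form of ContractionDims, for illustration only.
static std::pair<int, int> ContractionDimsSketch(bool adj_x, bool adj_y) {
  return {adj_x ? 0 : 1, adj_y ? 1 : 0};
}

int main() {
  assert((ContractionDimsSketch(false, false) == std::make_pair(1, 0)));  // X   * Y
  assert((ContractionDimsSketch(false, true) == std::make_pair(1, 1)));   // X   * Y^T
  assert((ContractionDimsSketch(true, false) == std::make_pair(0, 0)));   // X^T * Y
  assert((ContractionDimsSketch(true, true) == std::make_pair(0, 1)));    // X^T * Y^T
  return 0;
}
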
diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc
index c34ea14..609ddd6 100644
--- a/tensorflow/core/kernels/batch_norm_op.cc
+++ b/tensorflow/core/kernels/batch_norm_op.cc
@@ -127,8 +127,12 @@
     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                 {2}, 2, var.shape(), &dv));
     Tensor* db = nullptr;
-    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                {3}, 3, mean.shape(), &db));
+    if (scale_after_normalization_) {
+      OP_REQUIRES_OK(context, context->allocate_output(3, mean.shape(), &db));
+    } else {
+      OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
+                                  {3}, 3, mean.shape(), &db));
+    }
     Tensor* dg = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(4, gamma.shape(), &dg));
 
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 852b68e..de19da7 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -404,6 +404,26 @@
     ],
 )
 
+tf_cc_test(
+    name = "repeat_dataset_op_test",
+    size = "small",
+    srcs = ["repeat_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":repeat_dataset_op",
+        ":tensor_slice_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "take_dataset_op",
     srcs = ["take_dataset_op.cc"],
diff --git a/tensorflow/core/kernels/data/concatenate_dataset_op_test.cc b/tensorflow/core/kernels/data/concatenate_dataset_op_test.cc
index ba3c59b..1885c50 100644
--- a/tensorflow/core/kernels/data/concatenate_dataset_op_test.cc
+++ b/tensorflow/core/kernels/data/concatenate_dataset_op_test.cc
@@ -45,12 +45,12 @@
 
   // Creates a new ConcatenateDataset op kernel.
   Status CreateConcatenateDatasetKernel(
-      const DataTypeVector &output_tyeps,
+      const DataTypeVector &output_types,
       const std::vector<PartialTensorShape> &output_shapes,
       std::unique_ptr<OpKernel> *op_kernel) {
     node_def_ = test::function::NDef(
         kNodeName, kOpName, {"input_dataset", "another_dataset"},
-        {{"output_types", output_tyeps}, {"output_shapes", output_shapes}});
+        {{"output_types", output_types}, {"output_shapes", output_shapes}});
     TF_RETURN_IF_ERROR(CreateOpKernel(node_def_, op_kernel));
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/data/repeat_dataset_op_test.cc b/tensorflow/core/kernels/data/repeat_dataset_op_test.cc
new file mode 100644
index 0000000..61f314c
--- /dev/null
+++ b/tensorflow/core/kernels/data/repeat_dataset_op_test.cc
@@ -0,0 +1,560 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "repeat_dataset";
+constexpr char kOpName[] = "RepeatDataset";
+
+class RepeatDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates a `TensorSliceDataset` variant tensor from the input vector of
+  // tensors.
+  Status CreateTensorSliceDatasetTensor(
+      std::vector<Tensor> *const tensor_vector, Tensor *dataset_tensor) {
+    DatasetBase *tensor_slice_dataset;
+    TF_RETURN_IF_ERROR(CreateTensorSliceDataset(
+        "tensor_slice_node", tensor_vector, &tensor_slice_dataset));
+    TF_RETURN_IF_ERROR(
+        StoreDatasetInVariantTensor(tensor_slice_dataset, dataset_tensor));
+    return Status::OK();
+  }
+
+  // Creates a new `RepeatDataset` op kernel.
+  Status CreateRepeatDatasetKernel(
+      const DataTypeVector &output_types,
+      const std::vector<PartialTensorShape> &output_shapes,
+      std::unique_ptr<OpKernel> *op_kernel) {
+    node_def_ = test::function::NDef(
+        kNodeName, kOpName, {"input_dataset", "count"},
+        {{"output_types", output_types}, {"output_shapes", output_shapes}});
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def_, op_kernel));
+    return Status::OK();
+  }
+
+  // Creates a new `RepeatDataset` op kernel context.
+  Status CreateRepeatDatasetContext(
+      OpKernel *op_kernel, gtl::InlinedVector<TensorValue, 4> *const inputs,
+      std::unique_ptr<OpKernelContext> *context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs));
+    TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context));
+    return Status::OK();
+  }
+
+ private:
+  NodeDef node_def_;
+};
+
+struct TestCase {
+  std::vector<Tensor> input_tensors;
+  int64 count;
+  std::vector<Tensor> expected_outputs;
+  DataTypeVector expected_output_dtypes;
+  std::vector<PartialTensorShape> expected_output_shapes;
+  int64 expected_cardinality;
+  std::vector<int> breakpoints;
+};
+
+TestCase FiniteRepeatTestCase() {
+  return {
+      /*input_tensors*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 2}, {1, 2, 3, 4}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape{2, 1}, {"a", "b"})},
+      /*count*/ 2,
+      /*expected_outputs*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {1, 2}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape{1}, {"a"}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {3, 4}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape{1}, {"b"}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {1, 2}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape{1}, {"a"}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2}, {3, 4}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape{1}, {"b"})},
+      /*expected_output_dtypes*/ {DT_INT64, DT_STRING},
+      /*expected_output_shapes*/
+      {PartialTensorShape({2}), PartialTensorShape({1})},
+      /*expected_cardinality*/ 4,
+      /*breakpoints*/ {0, 1, 3}};
+}
+
+TestCase EmptyRepeatTestCase() {
+  return {
+      /*input_tensors*/
+      {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 2}, {1, 2, 3, 4}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape{2, 1}, {"a", "b"})},
+      /*count*/ 0,
+      /*expected_outputs*/
+      {},
+      /*expected_output_dtypes*/ {DT_INT64, DT_STRING},
+      /*expected_output_shapes*/
+      {PartialTensorShape({2}), PartialTensorShape({1})},
+      /*expected_cardinality*/ 0,
+      /*breakpoints*/ {0, 1, 3}};
+}
+
+TestCase ForeverRepeatTestCase() {
+  return {/*input_tensors*/
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{2, 1}, {1, 2})},
+          /*count*/ -1,
+          /*expected_outputs*/
+          // Use the first group of the repeated tensors to represent the
+          // infinite outputs.
+          {DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {1}),
+           DatasetOpsTestBase::CreateTensor<int64>(TensorShape{1}, {2})},
+          /*expected_output_dtypes*/ {DT_INT64},
+          /*expected_output_shapes*/ {PartialTensorShape({1})},
+          /*expected_cardinality*/ -1,
+          /*breakpoints*/ {0, 1, 3}};
+}
+
+class ParameterizedDatasetTest
+    : public RepeatDatasetOpTest,
+      public ::testing::WithParamInterface<TestCase> {};
+
+TEST_P(ParameterizedDatasetTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(repeat_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      repeat_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  auto expected_outputs_it = test_case.expected_outputs.begin();
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+
+  if (test_case.count < 0) {
+    // We test only a finite number of steps of the infinite sequence.
+    for (int i = 0; i < 100; ++i) {
+      TF_EXPECT_OK(iterator->GetNext(iterator_ctx.get(), &out_tensors,
+                                     &end_of_sequence));
+      for (const auto &tensor : out_tensors) {
+        TF_EXPECT_OK(ExpectEqual(tensor, *expected_outputs_it));
+        expected_outputs_it++;
+        // In the forever-repeat test case, the first group of the repeated
+        // tensors is used to represent the expected outputs, so the iterator
+        // of the expected outputs needs to be reset once it reaches the end.
+        if (expected_outputs_it == test_case.expected_outputs.end()) {
+          expected_outputs_it = test_case.expected_outputs.begin();
+        }
+      }
+    }
+    EXPECT_FALSE(end_of_sequence);
+  } else {
+    while (!end_of_sequence) {
+      TF_EXPECT_OK(iterator->GetNext(iterator_ctx.get(), &out_tensors,
+                                     &end_of_sequence));
+      if (!end_of_sequence) {
+        for (const auto &tensor : out_tensors) {
+          EXPECT_NE(expected_outputs_it, test_case.expected_outputs.end());
+          TF_EXPECT_OK(ExpectEqual(tensor, *expected_outputs_it));
+          expected_outputs_it++;
+        }
+      }
+    }
+    EXPECT_EQ(expected_outputs_it, test_case.expected_outputs.end());
+  }
+}
+
+TEST_F(RepeatDatasetOpTest, DatasetName) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  const TestCase &test_case = FiniteRepeatTestCase();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+
+  EXPECT_EQ(repeat_dataset->type_string(), kOpName);
+}
+
+TEST_P(ParameterizedDatasetTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+  TF_EXPECT_OK(VerifyTypesMatch(repeat_dataset->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedDatasetTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+  TF_EXPECT_OK(VerifyShapesCompatible(repeat_dataset->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_P(ParameterizedDatasetTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+
+  EXPECT_EQ(repeat_dataset->Cardinality(), GetParam().expected_cardinality);
+}
+
+TEST_F(RepeatDatasetOpTest, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = FiniteRepeatTestCase();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(repeat_dataset->Save(serialization_ctx.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_P(ParameterizedDatasetTest, IteratorOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(repeat_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      repeat_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  TF_EXPECT_OK(VerifyTypesMatch(iterator->output_dtypes(),
+                                test_case.expected_output_dtypes));
+}
+
+TEST_P(ParameterizedDatasetTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(repeat_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      repeat_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(),
+                                      test_case.expected_output_shapes));
+}
+
+TEST_P(ParameterizedDatasetTest, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(repeat_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      repeat_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  if (test_case.count < 0) {
+    EXPECT_EQ(iterator->prefix(), "Iterator::ForeverRepeat");
+  } else if (test_case.count == 0) {
+    EXPECT_EQ(iterator->prefix(), "Iterator::EmptyRepeat");
+  } else {
+    EXPECT_EQ(iterator->prefix(), "Iterator::FiniteRepeat");
+  }
+}
+
+TEST_P(ParameterizedDatasetTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+  const TestCase &test_case = GetParam();
+  auto expected_outputs_it = test_case.expected_outputs.begin();
+  Tensor tensor_slice_dataset_tensor(DT_VARIANT, TensorShape({}));
+  std::vector<Tensor> inputs_for_tensor_slice_dataset = test_case.input_tensors;
+  TF_ASSERT_OK(CreateTensorSliceDatasetTensor(&inputs_for_tensor_slice_dataset,
+                                              &tensor_slice_dataset_tensor));
+  Tensor count = CreateTensor<int64>(TensorShape{}, {test_case.count});
+  gtl::InlinedVector<TensorValue, 4> inputs_for_repeat_dataset;
+  inputs_for_repeat_dataset.emplace_back(&tensor_slice_dataset_tensor);
+  inputs_for_repeat_dataset.emplace_back(&count);
+
+  std::unique_ptr<OpKernel> repeat_dataset_kernel;
+  TF_ASSERT_OK(CreateRepeatDatasetKernel(test_case.expected_output_dtypes,
+                                         test_case.expected_output_shapes,
+                                         &repeat_dataset_kernel));
+  std::unique_ptr<OpKernelContext> repeat_dataset_context;
+  TF_ASSERT_OK(CreateRepeatDatasetContext(repeat_dataset_kernel.get(),
+                                          &inputs_for_repeat_dataset,
+                                          &repeat_dataset_context));
+  DatasetBase *repeat_dataset;
+  TF_ASSERT_OK(CreateDataset(repeat_dataset_kernel.get(),
+                             repeat_dataset_context.get(), &repeat_dataset));
+  core::ScopedUnref scoped_unref(repeat_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(
+      CreateIteratorContext(repeat_dataset_context.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      repeat_dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+
+  bool end_of_sequence = repeat_dataset->Cardinality() == 0;
+  std::vector<Tensor> out_tensors;
+  int cur_iteration = 0;
+  std::vector<int> breakpoints = GetParam().breakpoints;
+  for (int breakpoint : breakpoints) {
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_EXPECT_OK(iterator->Save(serialization_ctx.get(), &writer));
+    TF_EXPECT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);
+    TF_EXPECT_OK(iterator->Restore(iterator_ctx.get(), &reader));
+
+    while (cur_iteration < breakpoint) {
+      TF_EXPECT_OK(iterator->GetNext(iterator_ctx.get(), &out_tensors,
+                                     &end_of_sequence));
+      if (!end_of_sequence) {
+        for (auto &tensor : out_tensors) {
+          EXPECT_NE(expected_outputs_it, test_case.expected_outputs.end());
+          TF_EXPECT_OK(ExpectEqual(tensor, *expected_outputs_it));
+          expected_outputs_it++;
+        }
+      }
+      cur_iteration++;
+      if (test_case.count < 0 &&
+          expected_outputs_it == test_case.expected_outputs.end()) {
+        expected_outputs_it = test_case.expected_outputs.begin();
+      }
+    }
+
+    if (breakpoint >= repeat_dataset->Cardinality()) {
+      if (test_case.count < 0) {
+        EXPECT_FALSE(end_of_sequence);
+      } else {
+        EXPECT_TRUE(end_of_sequence);
+        EXPECT_EQ(expected_outputs_it, test_case.expected_outputs.end());
+      }
+    } else {
+      EXPECT_FALSE(end_of_sequence);
+    }
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(RepeatDatasetOpTest, ParameterizedDatasetTest,
+                         ::testing::ValuesIn(std::vector<TestCase>(
+                             {FiniteRepeatTestCase(), EmptyRepeatTestCase(),
+                              ForeverRepeatTestCase()})));
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
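The IteratorOutputPrefix expectations above encode how the repeat iterator is named by count; a hypothetical helper capturing only that mapping (names taken from the test expectations, not from the op implementation):

#include <string>

// Illustrative only: maps a repeat count to the iterator prefix expected above.
std::string ExpectedRepeatIteratorPrefix(long long count) {
  if (count < 0) return "Iterator::ForeverRepeat";  // repeat indefinitely
  if (count == 0) return "Iterator::EmptyRepeat";   // produce no elements
  return "Iterator::FiniteRepeat";                  // repeat `count` times
}
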
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h b/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h
new file mode 100644
index 0000000..a2afab4
--- /dev/null
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions-inl.h
@@ -0,0 +1,1496 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_INL_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_INL_H_
+
+// Note this header is used in both TF and TFLite.
+namespace Eigen {
+
+namespace internal {
+
+// WARNING: Most of the code here implicitly assumes that the matrix is in
+// ColMajor layout. This is guaranteed by the tensor contraction (see
+// TensorContraction.h).
+//
+// Inside Eigen a tensor contraction is represented by a matrix multiplication.
+// We don't want to actually extract image patches and reshape the result into
+// a matrix (this involves allocating huge extra memory), so the patch
+// extraction and reshape operations are implicit.
+//
+// TensorContractionInputMapper takes a matrix index and returns the coefficient
+// (or the packet) of the "virtual tensor" that would be at that index if we
+// were to actually reshape the result of patch extraction.
+//
+// TensorContractionSubMapper provides a similar view into the "virtual matrix"
+// at the given vertical and horizontal offsets.
+//
+// "Virtual matrix" dimensions:
+//   *0: kernelChannels * kernelRows * kernelCols;
+//    1: out_height * out_width * OTHERS (e.g. batches, etc.)
+//
+// *) extracted patches are contiguous in memory (innermost dimension, assuming
+//    col major layout)
+//
+// With these dimensions:
+//   row - offset within a single patch (in code: patchId)
+//   col - index of the extracted patch (in code: patchIndex)
+//         patchIndex ∈ [0..num_patches * OTHERS] (batch and other dimensions)
+//
+// TODO(ezhulenev): Consolidate this part of the code with the image patch
+// extraction code since they are both very similar.
+
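+// Worked example (illustrative numbers, not taken from the code below): for a
+// patch with kernelChannels = 4, kernelRows = 3 and kernelCols = 2, the
+// "virtual matrix" has 4 * 3 * 2 = 24 rows, and a row index (patchId)
+// decomposes as
+//   depth     = patchId % 4
+//   rowOffset = (patchId / 4) % 3
+//   colOffset = (patchId / 4) / 3
+// matching the arithmetic in loadCoeffStandard() below.
+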
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar_, typename Index,
+          typename nocontract_t, typename contract_t, int Side, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+class TensorContractionInputMapper<
+    Scalar_, Index, Side,
+    TensorEvaluator<
+        const TensorReshapingOp<NewDimension,
+                                const TensorImagePatchOp<Rows, Cols, ArgType> >,
+        Device>,
+    nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+    inner_dim_reordered, Alignment> {
+ public:
+  typedef Scalar_ Scalar;
+
+  typedef TensorContractionInputMapper<
+      Scalar, Index, Side,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+      inner_dim_reordered, Alignment>
+      Self;
+
+  typedef TensorContractionSubMapper<
+      Scalar, Index, Side,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+      inner_dim_reordered, Alignment>
+      SubMapper;
+
+  typedef SubMapper VectorMapper;
+  typedef SubMapper LinearMapper;
+  typedef typename packet_traits<Scalar>::type Packet;
+
+  EIGEN_DEVICE_FUNC
+  TensorContractionInputMapper(
+      const TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>& tensor,
+      const nocontract_t&, const nocontract_t&, const contract_t&,
+      const contract_t&)
+      : m_impl(tensor.impl().impl()) {
+    Index patch_rows;
+    Index patch_depth;
+    if (internal::traits<ArgType>::Layout == ColMajor) {
+      patch_depth = tensor.impl().dimensions()[0];
+      patch_rows = tensor.impl().dimensions()[1];
+      m_patch_cols = tensor.impl().dimensions()[2];
+      m_num_patches = tensor.impl().dimensions()[3];
+    } else {
+      const size_t NumDims = tensor.impl().dimensions().size();
+      patch_depth = tensor.impl().dimensions()[NumDims - 1];
+      patch_rows = tensor.impl().dimensions()[NumDims - 2];
+      m_patch_cols = tensor.impl().dimensions()[NumDims - 3];
+      m_num_patches = tensor.impl().dimensions()[NumDims - 4];
+    }
+
+    // Strides for navigating through the single patch.
+    m_patch_row_stride = patch_depth;
+    m_patch_col_stride = patch_rows * m_patch_row_stride;
+
+    m_patch_row_inflate_strides = tensor.impl().rowInflateStride();
+    m_patch_col_inflate_strides = tensor.impl().colInflateStride();
+
+    m_colStride = patch_rows;
+
+    m_outputRows = tensor.impl().outputRows();
+    m_row_strides = tensor.impl().userRowStride();
+    m_col_strides = tensor.impl().userColStride();
+
+    m_in_row_strides = tensor.impl().userInRowStride();
+    m_in_col_strides = tensor.impl().userInColStride();
+
+    if (internal::traits<ArgType>::Layout == ColMajor) {
+      m_inputRows = tensor.impl().impl().dimensions()[1];
+      m_inputCols = tensor.impl().impl().dimensions()[2];
+    } else {
+      const int NumDims = tensor.impl().impl().dimensions().size();
+      m_inputRows = tensor.impl().impl().dimensions()[NumDims - 2];
+      m_inputCols = tensor.impl().impl().dimensions()[NumDims - 3];
+    }
+
+    m_rowInputStride = patch_depth;
+    m_colInputStride = patch_depth * m_inputRows;
+    m_patchInputStride = patch_depth * m_inputRows * m_inputCols;
+
+    m_rowPaddingTop = tensor.impl().rowPaddingTop();
+    m_colPaddingLeft = tensor.impl().colPaddingLeft();
+
+    m_fastPatchRowStride =
+        internal::TensorIntDivisor<Index>(m_patch_row_stride);
+    m_fastPatchColStride =
+        internal::TensorIntDivisor<Index>(m_patch_col_stride);
+    m_fastInputRowStride =
+        internal::TensorIntDivisor<Index>(m_patch_row_inflate_strides);
+    m_fastInputColStride =
+        internal::TensorIntDivisor<Index>(m_patch_col_inflate_strides);
+    m_fastNumPatches = internal::TensorIntDivisor<Index>(m_num_patches);
+    m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
+    m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows);
+    m_fastDimZero = internal::TensorIntDivisor<Index>(patch_depth);
+  }
+
+  EIGEN_DEVICE_FUNC
+  TensorContractionInputMapper(const TensorContractionInputMapper& base_mapper)
+      : m_impl(base_mapper.m_impl) {
+    m_patch_cols = base_mapper.m_patch_cols;
+    m_num_patches = base_mapper.m_num_patches;
+
+    m_patch_row_stride = base_mapper.m_patch_row_stride;
+    m_patch_col_stride = base_mapper.m_patch_col_stride;
+
+    m_patch_row_inflate_strides = base_mapper.m_patch_row_inflate_strides;
+    m_patch_col_inflate_strides = base_mapper.m_patch_col_inflate_strides;
+
+    m_colStride = base_mapper.m_colStride;
+
+    m_rowInputStride = base_mapper.m_rowInputStride;
+    m_colInputStride = base_mapper.m_colInputStride;
+    m_patchInputStride = base_mapper.m_patchInputStride;
+
+    m_inputRows = base_mapper.m_inputRows;
+    m_inputCols = base_mapper.m_inputCols;
+
+    m_outputRows = base_mapper.m_outputRows;
+    m_row_strides = base_mapper.m_row_strides;
+    m_col_strides = base_mapper.m_col_strides;
+
+    m_in_row_strides = base_mapper.m_in_row_strides;
+    m_in_col_strides = base_mapper.m_in_col_strides;
+
+    m_rowPaddingTop = base_mapper.m_rowPaddingTop;
+    m_colPaddingLeft = base_mapper.m_colPaddingLeft;
+
+    m_fastPatchRowStride = base_mapper.m_fastPatchRowStride;
+    m_fastPatchColStride = base_mapper.m_fastPatchColStride;
+    m_fastInputRowStride = base_mapper.m_fastInputRowStride;
+    m_fastInputColStride = base_mapper.m_fastInputColStride;
+    m_fastNumPatches = base_mapper.m_fastNumPatches;
+    m_fastColStride = base_mapper.m_fastColStride;
+    m_fastOutputRows = base_mapper.m_fastOutputRows;
+    m_fastDimZero = base_mapper.m_fastDimZero;
+  }
+
+  // If this returns true, some optimizations for loading packets are turned
+  // off, because the image patches are "non-standard": the input has
+  // non-trivial strides or inflations.
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE bool nonStandardPatches() const {
+    return m_in_row_strides != 1 || m_in_col_strides != 1 ||
+           m_patch_row_inflate_strides != 1 || m_patch_col_inflate_strides != 1;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
+    return SubMapper(*this, i, j);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+    return LinearMapper(*this, i, j);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Scalar operator()(Index row) const {
+    Index rowIndex, colIndex, otherIndex;
+    computeBaseIndices(0, rowIndex, colIndex, otherIndex);
+    return loadCoeff(row, rowIndex, colIndex, otherIndex);
+  }
+
+  // Load the coefficient at the patchIndex location instead of the usual
+  // m_rowIndex, m_colIndex, m_otherIndex. This is currently only used by the
+  // gpu code.
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Scalar operator()(Index row, Index patchIndex) const {
+    Index rowIndex, colIndex, otherIndex;
+    computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex);
+    return loadCoeff(row, rowIndex, colIndex, otherIndex);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacket(Index row) const {
+    Index rowIndex, colIndex, otherIndex;
+    computeBaseIndices(0, rowIndex, colIndex, otherIndex);
+    return loadPacket(row, rowIndex, colIndex, otherIndex);
+  }
+
+  // Load the packet at the patchIndex location instead of the usual m_rowIndex,
+  // m_colIndex, m_otherIndex. This is currently only used by the gpu code.
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacket(Index row, Index patchIndex) const {
+    Index rowIndex, colIndex, otherIndex;
+    computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex);
+    return loadPacket(row, rowIndex, colIndex, otherIndex);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE const TensorEvaluator<ArgType, Device>& impl() const {
+    return m_impl;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_rowInputStride; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index patchRows() const { return m_colStride; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; }
+
+ private:
+  friend class TensorContractionSubMapper<
+      Scalar, Index, Side,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+      inner_dim_reordered, Alignment>;
+
+  // Load coefficient from a patch specified by the "within patch offset"
+  // (patchId) and the precomputed indices of the first element of the patch.
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Scalar loadCoeff(Index patchId, Index rowIndex,
+                                       Index colIndex, Index otherIndex) const {
+    // Find the offset of the element wrt the location of the first element.
+    const Index patchOffset = patchId / m_fastDimZero;
+
+    const Index colOffset = patchOffset / m_fastColStride;
+    const Index inputCol = colIndex + colOffset * m_in_col_strides;
+    const Index origInputCol =
+        (m_patch_col_inflate_strides == 1)
+            ? inputCol
+            : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
+
+    const Index rowOffset = patchOffset - colOffset * m_colStride;
+    const Index inputRow = rowIndex + rowOffset * m_in_row_strides;
+    const Index origInputRow =
+        (m_patch_row_inflate_strides == 1)
+            ? inputRow
+            : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
+    if (origInputCol < 0 || origInputRow < 0 || origInputCol >= m_inputCols ||
+        origInputRow >= m_inputRows ||
+        (inputCol != origInputCol * m_patch_col_inflate_strides) ||
+        (inputRow != origInputRow * m_patch_row_inflate_strides)) {
+      return Scalar(0);
+    }
+    const Index depth = patchId - patchOffset * patchDepth();
+    const Index inputIndex = depth + origInputRow * m_rowInputStride +
+                             origInputCol * m_colInputStride + otherIndex;
+    return m_impl.coeff(inputIndex);
+  }
+
+  // This is the same as loadCoeff(...), but optimized for all `inflate_strides`
+  // and `in_strides` equal to 1 (template specialization without templates).
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Scalar loadCoeffStandard(Index patchId, Index rowIndex,
+                                               Index colIndex,
+                                               Index otherIndex) const {
+    eigen_assert(!nonStandardPatches());
+
+    // Find the offset of the element wrt the location of the first element.
+    const Index patchOffset = patchId / m_fastDimZero;
+    const Index colOffset = patchOffset / m_fastColStride;
+    const Index rowOffset = patchOffset - colOffset * m_colStride;
+    const Index inputCol = colIndex + colOffset;
+    const Index inputRow = rowIndex + rowOffset;
+    if (inputCol < 0 || inputCol >= m_inputCols || inputRow < 0 ||
+        inputRow >= m_inputRows) {
+      return Scalar(0);
+    }
+    const Index depth = patchId - patchOffset * patchDepth();
+    const Index inputIndex = depth + inputRow * m_rowInputStride +
+                             inputCol * m_colInputStride + otherIndex;
+    return m_impl.coeff(inputIndex);
+  }
+
+  // Load packet from a patch specified by the "within patch offset"
+  // (patchId) and the precomputed indices of the first element of the patch.
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacket(Index patchId, Index rowIndex,
+                                        Index colIndex,
+                                        Index otherIndex) const {
+    const Index packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
+
+    if (nonStandardPatches()) {
+      return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
+    }
+    return loadPacketStandard(patchId, rowIndex, colIndex, otherIndex);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index patchId, Index rowIndex,
+                                                Index colIndex,
+                                                Index otherIndex) const {
+    const Index packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
+
+    eigen_assert(!nonStandardPatches());
+
+    if ((patchDepth() % packetSize) == 0) {
+      return loadPacketFast(patchId, rowIndex, colIndex, otherIndex);
+    } else {
+      // Offsets and input calculations here are identical to
+      // loadCoeffStandard(...), but computed for both ends of the packet.
+
+      const Index patchOffsets[2] = {
+          patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero};
+
+      const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride,
+                                   patchOffsets[1] / m_fastColStride};
+      const Index inputCols[2] = {colIndex + colOffsets[0],
+                                  colIndex + colOffsets[1]};
+      if (inputCols[0] >= m_inputCols || inputCols[1] < 0) {
+        // all zeros
+        return internal::pset1<Packet>(Scalar(0));
+      }
+
+      if (inputCols[0] == inputCols[1]) {
+        const Index rowOffsets[2] = {
+            patchOffsets[0] - colOffsets[0] * m_colStride,
+            patchOffsets[1] - colOffsets[1] * m_colStride};
+        eigen_assert(rowOffsets[0] <= rowOffsets[1]);
+        const Index inputRows[2] = {rowIndex + rowOffsets[0],
+                                    rowIndex + rowOffsets[1]};
+
+        if (inputRows[0] >= m_inputRows || inputRows[1] < 0) {
+          // all zeros
+          return internal::pset1<Packet>(Scalar(0));
+        }
+
+        if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) {
+          // no padding
+          const Index depth = patchId - patchOffsets[0] * patchDepth();
+          const Index inputIndex = depth + inputRows[0] * m_rowInputStride +
+                                   inputCols[0] * m_colInputStride + otherIndex;
+          return m_impl.template packet<Unaligned>(inputIndex);
+        }
+      }
+    }
+    return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index patchId, Index rowIndex,
+                                            Index colIndex,
+                                            Index otherIndex) const {
+    const Index packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
+
+    eigen_assert(!nonStandardPatches());
+    eigen_assert((patchDepth() % packetSize) == 0);
+    // Find the offset of the element wrt the location of the first element.
+    const Index patchOffset = patchId / m_fastDimZero;
+    eigen_assert((patchId + packetSize - 1) / m_fastDimZero == patchOffset);
+
+    const Index colOffset = patchOffset / m_fastColStride;
+    const Index rowOffset = patchOffset - colOffset * m_colStride;
+    const Index inputCol = colIndex + colOffset;
+    const Index inputRow = rowIndex + rowOffset;
+    if (inputCol < 0 || inputRow < 0 || inputCol >= m_inputCols ||
+        inputRow >= m_inputRows) {
+      // all zeros
+      return internal::pset1<Packet>(Scalar(0));
+    }
+    // no padding
+    const Index depth = patchId - patchOffset * patchDepth();
+    const Index inputIndex = depth + inputRow * m_rowInputStride +
+                             inputCol * m_colInputStride + otherIndex;
+    return m_impl.template packet<Unaligned>(inputIndex);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet packetWithPossibleZero(
+      Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const {
+    const int packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_MAX
+    typename internal::remove_const<Scalar>::type values[packetSize];
+    for (int i = 0; i < packetSize; ++i) {
+      values[i] = loadCoeff(patchId + i, rowIndex, colIndex, otherIndex);
+    }
+    Packet rslt = internal::pload<Packet>(values);
+    return rslt;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void computeBaseIndices(
+      Index patchIndex, Index& rowIndex, Index& colIndex,
+      Index& otherIndex) const {
+    const size_t NumInputDims = array_size<
+        typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+    otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches;
+    const Index patch2DIndex = (NumInputDims == 3)
+                                   ? patchIndex
+                                   : (patchIndex - otherIndex * m_num_patches);
+    otherIndex *= m_patchInputStride;
+    colIndex = patch2DIndex / m_fastOutputRows;
+    rowIndex = patch2DIndex - colIndex * m_outputRows;
+    colIndex = colIndex * m_col_strides - m_colPaddingLeft;
+    rowIndex = rowIndex * m_row_strides - m_rowPaddingTop;
+  }
+
+  Index m_patch_cols;   // number of columns in the patch
+  Index m_num_patches;  // number of patches to extract.
+
+  // Strides for navigating through the single patch.
+  Index m_patch_row_stride;
+  Index m_patch_col_stride;
+  internal::TensorIntDivisor<Index> m_fastPatchRowStride;
+  internal::TensorIntDivisor<Index> m_fastPatchColStride;
+
+  Index m_patch_row_inflate_strides;  // the strides for row inflation in the
+                                      // image patch
+  Index m_patch_col_inflate_strides;  // the strides for col inflation in the
+                                      // image patch
+  // Fast representation of inflation strides.
+  internal::TensorIntDivisor<Index> m_fastInputRowStride;
+  internal::TensorIntDivisor<Index> m_fastInputColStride;
+
+  Index m_otherStride;
+  Index m_colStride;
+  internal::TensorIntDivisor<Index> m_fastNumPatches;
+  internal::TensorIntDivisor<Index> m_fastColStride;
+
+  Index m_rowInputStride;    // row stride in the input tensor
+  Index m_colInputStride;    // col stride in the input tensor
+  Index m_patchInputStride;  // patch stride in the input tensor
+
+  Index m_inputRows;  // Number of rows in the input tensor
+  Index m_inputCols;  // Number of cols in the input tensor
+
+  Index m_outputRows;  // Number of patch rows
+
+  Index m_row_strides;  // User specified row stride
+  Index m_col_strides;  // User specified col stride
+
+  Index m_in_row_strides;  // User specified input row stride
+  Index m_in_col_strides;  // User specified input col stride
+
+  Index m_rowPaddingTop;   // Row padding
+  Index m_colPaddingLeft;  // Column padding
+
+  internal::TensorIntDivisor<Index> m_fastOutputRows;
+  internal::TensorIntDivisor<Index> m_fastDimZero;
+
+  const TensorEvaluator<ArgType, Device> m_impl;
+};
+
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, int Side, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+class TensorContractionSubMapper<
+    Scalar, Index, Side,
+    TensorEvaluator<
+        const TensorReshapingOp<NewDimension,
+                                const TensorImagePatchOp<Rows, Cols, ArgType> >,
+        Device>,
+    nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+    inner_dim_reordered, Alignment> {
+ public:
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename packet_traits<Scalar>::half HalfPacket;
+
+  typedef TensorContractionInputMapper<
+      Scalar, Index, Side,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+      inner_dim_reordered, Alignment>
+      ParentMapper;
+
+  typedef TensorContractionSubMapper<
+      Scalar, Index, Side,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+      inner_dim_reordered, Alignment>
+      Self;
+
+  typedef Self LinearMapper;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(
+      const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
+      : m_depth_offset(vert_offset),
+        m_col_offset(horiz_offset),
+        m_base_mapper(base_mapper) {
+    m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex,
+                                     m_otherIndex);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(
+      const Self& base_mapper, Index vert_offset, Index horiz_offset)
+      : m_depth_offset(vert_offset + base_mapper.m_depth_offset),
+        m_col_offset(horiz_offset + base_mapper.m_col_offset),
+        m_base_mapper(base_mapper.m_base_mapper) {
+    m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex,
+                                     m_otherIndex);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
+    return m_base_mapper.loadCoeff(i + m_depth_offset, m_rowIndex, m_colIndex,
+                                   m_otherIndex);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i,
+                                                          Index j) const {
+    return m_base_mapper(i + m_depth_offset, j + m_col_offset);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
+    return m_base_mapper.loadPacket(i + m_depth_offset, m_rowIndex, m_colIndex,
+                                    m_otherIndex);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i,
+                                                          Index j) const {
+    return m_base_mapper.template loadPacket<Alignment>(i + m_depth_offset,
+                                                        j + m_col_offset);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar
+  loadCoeffStandard(Index i) const {
+    return m_base_mapper.loadCoeffStandard(i + m_depth_offset, m_rowIndex,
+                                           m_colIndex, m_otherIndex);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index i) const {
+    return m_base_mapper.loadPacketFast(i + m_depth_offset, m_rowIndex,
+                                        m_colIndex, m_otherIndex);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet
+  loadPacketStandard(Index i) const {
+    return m_base_mapper.loadPacketStandard(i + m_depth_offset, m_rowIndex,
+                                            m_colIndex, m_otherIndex);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC bool aligned(Index) const {
+    return false;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE bool nonStandardPatches() const {
+    return m_base_mapper.nonStandardPatches();
+  }
+
+  // Max(Col|Row|Depth): compute the upper limit for the column, row and depth
+  // index respectively that fits into the peeled_k elements starting at
+  // m_depth_offset.
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const {
+    const Index max_col =
+        (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) /
+        fastPatchColStride();
+    return std::min<Index>(1 + max_col, patchCols());
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k,
+                                   const Index col) const {
+    const Index max_row = (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) -
+                           col * patchColStride()) /
+                          fastPatchRowStride();
+    return std::min<Index>(1 + max_row, patchRows());
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index maxDepth(const Index peeled_k, const Index col,
+                                     Index row) const {
+    const Index max_depth = m_depth_offset + peeled_k -  //
+                            col * patchColStride() -     //
+                            row * patchRowStride();
+    return std::min<Index>(max_depth, patchDepth());
+  }
+
+  // MaxDepth uses only the remaining number of elements in the peeled_k.
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index maxDepth(const Index num_elements,
+                                     const Index start_depth) const {
+    return std::min<Index>(start_depth + num_elements, patchDepth());
+  }
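+
+  // A worked example (illustrative numbers, not taken from the code above):
+  // with patchDepth() == 8 and patchRows() == patchCols() == 3 we have
+  // patchRowStride() == 8 and patchColStride() == 24. For m_depth_offset == 0
+  // and peeled_k == 32:
+  //   maxCol(32)         == min(1 + 31 / 24, 3) == 2
+  //   maxRow(32, 1)      == min(1 + (31 - 24) / 8, 3) == 1
+  //   maxDepth(32, 1, 0) == min(32 - 24 - 0, 8) == 8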
+
+  // Every register matters in this code, so sometimes to prevent register
+  // spilling, instead of the variable that you would expect to see, we use
+  // another one that is guaranteed to have the same value. E.g. patch depth
+  // is always the same as input depth, and it is also the same as the input
+  // row stride. A number of other parameters have similar relations.
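+  // For example, the constructor sets m_rowInputStride = patch_depth, so
+  // patchDepth() can simply return m_rowInputStride instead of keeping a
+  // separate member.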
+
+  typedef internal::TensorIntDivisor<Index> IndexDivisor;
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index patchDepth() const {
+    return m_base_mapper.m_rowInputStride;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index patchRows() const {
+    return m_base_mapper.m_colStride;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index patchCols() const {
+    return m_base_mapper.m_patch_cols;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index patchRowStride() const {
+    eigen_assert(patchDepth() == m_base_mapper.m_patch_row_stride &&
+                 "Patch depth must be equal to patch row stride.");
+    return patchDepth();
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index patchColStride() const {
+    return m_base_mapper.m_patch_col_stride;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE IndexDivisor fastPatchRowStride() const {
+    eigen_assert(patchDepth() == m_base_mapper.m_patch_row_stride &&
+                 "Patch depth must be equal to patch row stride.");
+    return m_base_mapper.m_fastDimZero;  // patch_depth
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE IndexDivisor fastPatchColStride() const {
+    return m_base_mapper.m_fastPatchColStride;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth,
+                                             const Index baseIndex) const {
+    const Index inputIndex = depth + baseIndex;
+    return m_base_mapper.m_impl.template packet<Unaligned>(inputIndex);
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Scalar coeffNoPadding(const Index depth,
+                                            const Index baseIndex) const {
+    const Index inputIndex = depth + baseIndex;
+    return m_base_mapper.m_impl.coeff(inputIndex);
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE bool padRow(const Index row) const {
+    const Index r = m_rowIndex + row;
+    return r < 0 || r >= m_base_mapper.m_inputRows;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE bool padAnyRow(const Index first_row,
+                                     const Index last_row) const {
+    return m_rowIndex + first_row < 0 ||
+           m_rowIndex + last_row >= m_base_mapper.m_inputRows;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE bool padCol(const Index col) const {
+    const Index c = m_colIndex + col;
+    return c < 0 || c >= m_base_mapper.m_inputCols;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index baseIndex(const Index row, const Index col) const {
+    const Index r = m_rowIndex + row;
+    const Index c = m_colIndex + col;
+    return r * m_base_mapper.m_rowInputStride +
+           c * m_base_mapper.m_colInputStride + m_otherIndex;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index rowStride() const {
+    return m_base_mapper.m_row_strides;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index colStride() const {
+    return m_base_mapper.m_col_strides;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index rowOffset() const {
+    const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
+    const Index colOffset = patchOffset / m_base_mapper.m_fastColStride;
+    return patchOffset - colOffset * m_base_mapper.m_colStride;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index colOffset() const {
+    const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
+    const Index colOffset = patchOffset / m_base_mapper.m_fastColStride;
+    return colOffset;
+  }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index depthOffset() const {
+    return m_depth_offset % patchDepth();
+  }
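+
+  // A worked example (illustrative numbers): with patchDepth() == 8 and
+  // patchRows() == 3 (so m_fastDimZero divides by 8 and m_fastColStride by 3),
+  // a depth offset of m_depth_offset == 30 decomposes as
+  //   colOffset()   == (30 / 8) / 3 == 1,
+  //   rowOffset()   == (30 / 8) - 1 * 3 == 0,
+  //   depthOffset() == 30 % 8 == 6,
+  // i.e. 1 * 24 + 0 * 8 + 6 == 30.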
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper
+  getLinearMapper(Index i, Index j) const {
+    return LinearMapper(m_base_mapper, i + m_depth_offset, j + m_col_offset);
+  }
+
+ private:
+  Index m_depth_offset;  // First row in the input matrix
+  Index m_col_offset;    // First col in the input matrix
+
+  // Knowing that: col_offset == patchIndex * OTHERS, we keep precomputed base
+  // indices for the first element in a patch specified by col_offset
+  // (see computeBaseIndices(...) for details).
+  Index m_rowIndex;
+  Index m_colIndex;
+  Index m_otherIndex;
+
+  const ParentMapper m_base_mapper;  // Keeping a copy instead of a reference
+                                     // performs better in benchmarks.
+};
+
+// Arrange a block of the right input matrix (in our case it's always a "virtual
+// matrix" constructed from extracted image patches) in contiguous memory.
+//
+// Given column major input (A0 beside A1 in memory):
+// A0 B0 C0 D0  E0 F0 G0 H0 ... Z0
+// A1 B1 C1 D1  E1 F1 G1 H1 ... Z1
+// A2 B2 C2 D2  E2 F2 G2 H2 ... Z2
+// A3 B3 C3 D3  E3 F3 G3 H3 ... Z3
+// A4 B4 C4 D4  E4 F4 G4 H4 ... Z4
+// A5 B5 C5 D5  E5 F5 G5 H5 ... Z5
+// A6 B6 C6 D6  E6 F6 G6 H6 ... Z6
+// A7 B7 C7 D7  E7 F7 G7 H7 ... Z7
+// A8 ...
+// ...
+//
+// *) A, B, C, ... - patches extracted from the original input.
+// *) A0, A1, A2 ... - values from the same patch at different offsets.
+//
+// The traversal (packed rhs memory) order (B0 beside A0 in memory):
+// A0 B0 C0 D0 A1 B1 C1 D1 ...
+// E0 F0 G0 H0 E1 F1 G1 H1 ...
+// ...
+// Z0 Z1 Z2 Z3 Z4 Z5 Z6 Z7 ... <- doesn't belong to any block (nr = 4)
+//
+// This traversal order must be the same as in default gemm_pack_rhs defined in
+// GeneralBlockPanelKernel.h.
+//
+// *) nr - number of registers along the 'n' dimension.
+//    See GeneralBlockPanelKernel.h and "Anatomy of High-Performance Matrix
+//    Multiplication" paper.
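+//
+// Illustrative sketch (an assumption for exposition, not part of the build):
+// for a block of `depth` rows and 4 columns the packing below is equivalent
+// to the scalar loop
+//
+//   for (Index k = 0; k < depth; ++k)       // offset within the patch
+//     for (Index c = 0; c < 4; ++c)         // column within the 4-wide block
+//       *block++ = virtual_rhs(k, j2 + c);  // hypothetical dense rhs view
+//
+// The vectorized code simply performs this interleaving packet_size rows at a
+// time using ptranspose().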
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
+          int nr>
+struct gemm_pack_rhs<
+    Scalar, Index,
+    TensorContractionSubMapper<
+        Scalar, Index, Rhs,
+        TensorEvaluator<
+            const TensorReshapingOp<
+                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+            Device>,
+        nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+        inner_dim_reordered, Alignment>,
+    nr, ColMajor, false, false> {
+  typedef TensorContractionSubMapper<
+      Scalar, Index, Rhs,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+      inner_dim_reordered, Alignment>
+      SubMapper;
+  typedef SubMapper DataMapper;
+  typedef typename packet_traits<Scalar>::type Packet;
+
+  EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
+                                    Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0) const {
+    eigen_assert(stride == 0);
+    eigen_assert(offset == 0);
+
+    const Index packet_cols4 = (cols / 4) * 4;
+    const Index peeled_k = (depth / packet_size) * packet_size;
+    const bool non_standard_patches = rhs.nonStandardPatches();
+
+    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+      Index k = 0;
+      if ((packet_size % 4) == 0 && !non_standard_patches) {
+        // FAST PATH:
+        // Iterate over patch columns and rows if we know that a single
+        // packet does not span multiple rows or columns.
+        if ((rhs.patchDepth() % packet_size) == 0) {
+          const Index start_col = rhs.colOffset();
+          const Index max_col = rhs.maxCol(peeled_k);
+
+          for (Index c = start_col; c < max_col; ++c) {
+            eigen_assert(k <= peeled_k);
+
+            const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
+            const Index max_row = rhs.maxRow(peeled_k, c);
+
+            const bool pad_col0 = dm0.padCol(c);
+            const bool pad_col1 = dm1.padCol(c);
+            const bool pad_col2 = dm2.padCol(c);
+            const bool pad_col3 = dm3.padCol(c);
+
+            // Check if we can squeeze reads along the `row` and `depth`
+            // dimensions (two innermost dimensions).
+            if (!pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
+                !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&  //
+                !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&  //
+                !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&  //
+                !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) {
+              // Compute how many elements we can squeeze read.
+              const Index start_depth =
+                  (c == start_col) ? rhs.depthOffset() : 0;
+
+              // Upper bound for the number of elements in the depth dimension
+              // that we can squeeze read.
+              const Index squeeze_length =
+                  (max_row - start_row) * rhs.patchDepth() - start_depth;
+
+              // Do not overshoot beyond the block size.
+              const Index max_depth =
+                  start_depth + std::min<Index>(peeled_k - k, squeeze_length);
+              eigen_assert((max_depth - start_depth) % packet_size == 0);
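+
+              // For example (illustrative numbers): with patchDepth() == 8,
+              // start_row == 1, max_row == 4 and start_depth == 0 we get
+              // squeeze_length == (4 - 1) * 8 == 24, so up to 24 consecutive
+              // coefficients are read with packetNoPadding() before moving on
+              // to the next column.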
+
+              const Index idx0 = dm0.baseIndex(start_row, c);
+              const Index idx1 = dm1.baseIndex(start_row, c);
+              const Index idx2 = dm2.baseIndex(start_row, c);
+              const Index idx3 = dm3.baseIndex(start_row, c);
+
+              for (Index d = start_depth; d < max_depth; d += packet_size) {
+                eigen_assert(k < peeled_k);
+                PacketBlock<Packet, 4> kernel;
+                kernel.packet[0] = rhs.packetNoPadding(d, idx0);
+                kernel.packet[1] = rhs.packetNoPadding(d, idx1);
+                kernel.packet[2] = rhs.packetNoPadding(d, idx2);
+                kernel.packet[3] = rhs.packetNoPadding(d, idx3);
+                ptranspose(kernel);
+                pstoreu(block + 0 * packet_size, kernel.packet[0]);
+                pstoreu(block + 1 * packet_size, kernel.packet[1]);
+                pstoreu(block + 2 * packet_size, kernel.packet[2]);
+                pstoreu(block + 3 * packet_size, kernel.packet[3]);
+                block += 4 * packet_size;
+                k += packet_size;
+              }
+
+              // Go to the next column.
+              continue;
+            }
+
+            // If we can't squeeze reads, process rows one by one.
+            for (Index r = start_row; r < max_row; ++r) {
+              eigen_assert(k <= peeled_k);
+
+              const bool pad0 = pad_col0 || dm0.padRow(r);
+              const bool pad1 = pad_col1 || dm1.padRow(r);
+              const bool pad2 = pad_col2 || dm2.padRow(r);
+              const bool pad3 = pad_col3 || dm3.padRow(r);
+
+              const Index idx0 = dm0.baseIndex(r, c);
+              const Index idx1 = dm1.baseIndex(r, c);
+              const Index idx2 = dm2.baseIndex(r, c);
+              const Index idx3 = dm3.baseIndex(r, c);
+
+              const Index start_depth = ((c == start_col) && (r == start_row))
+                                            ? rhs.depthOffset()
+                                            : 0;
+              const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
+              eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+              for (Index d = start_depth; d < max_depth; d += packet_size) {
+                eigen_assert(k < peeled_k);
+                PacketBlock<Packet, 4> kernel;
+                kernel.packet[0] = pad0 ? pset1<Packet>(Scalar(0))
+                                        : rhs.packetNoPadding(d, idx0);
+                kernel.packet[1] = pad1 ? pset1<Packet>(Scalar(0))
+                                        : rhs.packetNoPadding(d, idx1);
+                kernel.packet[2] = pad2 ? pset1<Packet>(Scalar(0))
+                                        : rhs.packetNoPadding(d, idx2);
+                kernel.packet[3] = pad3 ? pset1<Packet>(Scalar(0))
+                                        : rhs.packetNoPadding(d, idx3);
+                ptranspose(kernel);
+                pstoreu(block + 0 * packet_size, kernel.packet[0]);
+                pstoreu(block + 1 * packet_size, kernel.packet[1]);
+                pstoreu(block + 2 * packet_size, kernel.packet[2]);
+                pstoreu(block + 3 * packet_size, kernel.packet[3]);
+                block += 4 * packet_size;
+                k += packet_size;
+              }
+            }
+          }
+
+          // The loop above should fill peeled_k elements.
+          eigen_assert(peeled_k == k);
+
+        } else {
+          for (; k < peeled_k; k += packet_size) {
+            PacketBlock<Packet, 4> kernel;
+            kernel.packet[0] = dm0.loadPacketStandard(k);
+            kernel.packet[1] = dm1.loadPacketStandard(k);
+            kernel.packet[2] = dm2.loadPacketStandard(k);
+            kernel.packet[3] = dm3.loadPacketStandard(k);
+            ptranspose(kernel);
+            pstoreu(block + 0 * packet_size, kernel.packet[0]);
+            pstoreu(block + 1 * packet_size, kernel.packet[1]);
+            pstoreu(block + 2 * packet_size, kernel.packet[2]);
+            pstoreu(block + 3 * packet_size, kernel.packet[3]);
+            block += 4 * packet_size;
+          }
+        }
+      }
+
+      // Copy the remaining coefficients of the column block after the peeled_k.
+      if (!rhs.nonStandardPatches()) {
+        for (; k < depth; k++) {
+          block[0] = dm0.loadCoeffStandard(k);
+          block[1] = dm1.loadCoeffStandard(k);
+          block[2] = dm2.loadCoeffStandard(k);
+          block[3] = dm3.loadCoeffStandard(k);
+          block += 4;
+        }
+      } else {
+        for (; k < depth; k++) {
+          block[0] = dm0(k);
+          block[1] = dm1(k);
+          block[2] = dm2(k);
+          block[3] = dm3(k);
+          block += 4;
+        }
+      }
+    }
+
+    // Copy the remaining columns one at a time (nr==1).
+    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
+      for (Index k = 0; k < depth; k++) {
+        *block = dm0(k);
+        block += 1;
+      }
+    }
+  }
+};
+
+// Template specialization for packet_size = 2. We must special-case packet
+// blocks with nr > packet_size, e.g. PacketBlock<Packet2d, 4>.
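+//
+// Illustrative sketch (an assumption for exposition): a 4-column tile is
+// handled as two independent 2x2 transposes,
+//
+//   PacketBlock<Packet, 2> k0, k1;   // columns {0,1} and {2,3}
+//   ptranspose(k0);
+//   ptranspose(k1);
+//   // interleaving the stores of k0/k1 reproduces the packed order
+//   // A0 B0 C0 D0 A1 B1 C1 D1 ...
+//
+// which yields the same layout as the PacketBlock<Packet, 4> path above.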
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
+          bool inner_dim_reordered, int Alignment, int nr>
+struct gemm_pack_rhs<
+    Scalar, Index,
+    TensorContractionSubMapper<
+        Scalar, Index, Rhs,
+        TensorEvaluator<
+            const TensorReshapingOp<
+                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+            Device>,
+        nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
+        Alignment>,
+    nr, ColMajor, false, false> {
+  typedef TensorContractionSubMapper<
+      Scalar, Index, Rhs,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
+      Alignment>
+      SubMapper;
+  typedef SubMapper DataMapper;
+  typedef typename packet_traits<Scalar>::type Packet;
+
+  EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
+                                    Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0) const {
+    eigen_assert(stride == 0);
+    eigen_assert(offset == 0);
+
+    const int packet_size = 2;
+    const Index packet_cols4 = (cols / 4) * 4;
+    const Index peeled_k = (depth / packet_size) * packet_size;
+    const bool non_standard_patches = rhs.nonStandardPatches();
+
+    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+      Index k = 0;
+      if (!non_standard_patches) {
+        // FAST PATH:
+        // Iterate over patch columns and rows if we know that a single
+        // packet does not span multiple rows or columns.
+        if ((rhs.patchDepth() % packet_size) == 0) {
+          const Index start_col = rhs.colOffset();
+          const Index max_col = rhs.maxCol(peeled_k);
+
+          for (Index c = start_col; c < max_col; ++c) {
+            eigen_assert(k <= peeled_k);
+
+            const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
+            const Index max_row = rhs.maxRow(peeled_k, c);
+
+            const bool pad_col0 = dm0.padCol(c);
+            const bool pad_col1 = dm1.padCol(c);
+            const bool pad_col2 = dm2.padCol(c);
+            const bool pad_col3 = dm3.padCol(c);
+
+            // We can squeeze reads along the `row` and `depth` dimensions if
+            // the row stride is `1`, which means that `row` and `depth`
+            // dimensions are contiguous (two innermost dimensions).
+            if (rhs.rowStride() == 1 &&                                //
+                !pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
+                !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&  //
+                !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&  //
+                !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&  //
+                !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) {
+              // Compute how many elements we can squeeze read.
+              const Index start_depth =
+                  (c == start_col) ? rhs.depthOffset() : 0;
+
+              // Upper bound for the number of elements in the depth dimension
+              // that we can squeeze read.
+              const Index squeeze_length =
+                  (max_row - start_row) * rhs.patchDepth() - start_depth;
+
+              // Do not overshoot beyond the block size.
+              const Index max_depth =
+                  start_depth + std::min<Index>(peeled_k - k, squeeze_length);
+              eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+              const Index idx0 = dm0.baseIndex(start_row, c);
+              const Index idx1 = dm1.baseIndex(start_row, c);
+              const Index idx2 = dm2.baseIndex(start_row, c);
+              const Index idx3 = dm3.baseIndex(start_row, c);
+
+              for (Index d = start_depth; d < max_depth; d += packet_size) {
+                PacketBlock<Packet, 2> kernel0;
+                PacketBlock<Packet, 2> kernel1;
+                kernel0.packet[0] = rhs.packetNoPadding(d, idx0);
+                kernel0.packet[1] = rhs.packetNoPadding(d, idx1);
+                kernel1.packet[0] = rhs.packetNoPadding(d, idx2);
+                kernel1.packet[1] = rhs.packetNoPadding(d, idx3);
+                ptranspose(kernel0);
+                ptranspose(kernel1);
+                pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+                pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+                pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+                pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+                block += 4 * packet_size;
+                k += packet_size;
+              }
+
+              // Go to the next column.
+              continue;
+            }
+
+            // If we can't squeeze reads, process rows one by one.
+            for (Index r = start_row; r < max_row; ++r) {
+              eigen_assert(k <= peeled_k);
+
+              const bool pad0 = pad_col0 || dm0.padRow(r);
+              const bool pad1 = pad_col1 || dm1.padRow(r);
+              const bool pad2 = pad_col2 || dm2.padRow(r);
+              const bool pad3 = pad_col3 || dm3.padRow(r);
+
+              const Index idx0 = dm0.baseIndex(r, c);
+              const Index idx1 = dm1.baseIndex(r, c);
+              const Index idx2 = dm2.baseIndex(r, c);
+              const Index idx3 = dm3.baseIndex(r, c);
+
+              const Index start_depth = ((c == start_col) && (r == start_row))
+                                            ? rhs.depthOffset()
+                                            : 0;
+              const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
+              eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+              for (Index d = start_depth; d < max_depth; d += packet_size) {
+                eigen_assert(k < peeled_k);
+                PacketBlock<Packet, 2> kernel0;
+                PacketBlock<Packet, 2> kernel1;
+                kernel0.packet[0] = pad0 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx0);
+                kernel0.packet[1] = pad1 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx1);
+                kernel1.packet[0] = pad2 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx2);
+                kernel1.packet[1] = pad3 ? pset1<Packet>(Scalar(0))
+                                         : rhs.packetNoPadding(d, idx3);
+                ptranspose(kernel0);
+                ptranspose(kernel1);
+                pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+                pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+                pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+                pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+                block += 4 * packet_size;
+                k += packet_size;
+              }
+            }
+          }
+
+          // The loop above should fill peeled_k elements.
+          eigen_assert(peeled_k == k);
+
+        } else {
+          // A packet can span multiple rows or columns, so we have to go
+          // through the slower "standard" path.
+          for (; k < peeled_k; k += packet_size) {
+            PacketBlock<Packet, 2> kernel0;
+            PacketBlock<Packet, 2> kernel1;
+            kernel0.packet[0] = dm0.loadPacketStandard(k);
+            kernel0.packet[1] = dm1.loadPacketStandard(k);
+            kernel1.packet[0] = dm2.loadPacketStandard(k);
+            kernel1.packet[1] = dm3.loadPacketStandard(k);
+            ptranspose(kernel0);
+            ptranspose(kernel1);
+            pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+            pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+            pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+            pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+            block += 4 * packet_size;
+          }
+        }
+      }
+
+      // Copy the remaining coefficients of the column block after the peeled_k.
+      if (!non_standard_patches) {
+        for (; k < depth; k++) {
+          block[0] = dm0.loadCoeffStandard(k);
+          block[1] = dm1.loadCoeffStandard(k);
+          block[2] = dm2.loadCoeffStandard(k);
+          block[3] = dm3.loadCoeffStandard(k);
+          block += 4;
+        }
+      } else {
+        for (; k < depth; k++) {
+          block[0] = dm0(k);
+          block[1] = dm1(k);
+          block[2] = dm2(k);
+          block[3] = dm3(k);
+          block += 4;
+        }
+      }
+    }
+
+    // Copy the remaining columns one at a time (nr==1).
+    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
+      for (Index k = 0; k < depth; k++) {
+        *block = dm0(k);
+        block += 1;
+      }
+    }
+  }
+};
+
+// Special case for non-vectorized types such as float16.
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
+          bool inner_dim_reordered, int Alignment, int nr>
+struct gemm_pack_rhs<
+    Scalar, Index,
+    TensorContractionSubMapper<
+        Scalar, Index, Rhs,
+        TensorEvaluator<
+            const TensorReshapingOp<
+                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+            Device>,
+        nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered,
+        Alignment>,
+    nr, ColMajor, false, false> {
+  typedef TensorContractionSubMapper<
+      Scalar, Index, Rhs,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered,
+      Alignment>
+      SubMapper;
+  typedef SubMapper DataMapper;
+
+  EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
+                                    Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0) const {
+    eigen_assert(stride == 0);
+    eigen_assert(offset == 0);
+
+    const Index packet_cols4 = (cols / 4) * 4;
+
+    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+      if (!rhs.nonStandardPatches()) {
+        for (Index k = 0; k < depth; k++) {
+          block[0] = dm0.loadCoeffStandard(k);
+          block[1] = dm1.loadCoeffStandard(k);
+          block[2] = dm2.loadCoeffStandard(k);
+          block[3] = dm3.loadCoeffStandard(k);
+          block += 4;
+        }
+      } else {
+        for (Index k = 0; k < depth; k++) {
+          block[0] = dm0(k);
+          block[1] = dm1(k);
+          block[2] = dm2(k);
+          block[3] = dm3(k);
+          block += 4;
+        }
+      }
+    }
+
+    // Copy the remaining columns one at a time (nr==1).
+    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
+      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
+      for (Index k = 0; k < depth; k++) {
+        *block = dm0(k);
+        block += 1;
+      }
+    }
+  }
+};
+}  // end namespace internal
+
+/** SpatialConvolution
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Applies a 2D convolution over a multichannel input image.
+ *
+ * The input parameter is expected to be a tensor with a rank of 3 or more
+ * (channels, height, width, and optionally others).
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * kernel_height, kernel_width).
+ * The input and the kernel must both be in col-major layout. The result will
+ * also be in col-major layout.
+ *
+ * If col_in_stride or row_in_stride is greater than 1, the convolution is
+ * applied with holes (a.k.a. atrous convolution), sampling the input every
+ * col_in_stride columns and every row_in_stride rows.
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
+ * input. The dimensions of the result will be filters, height, width (and
+ * others if applicable).
+ *
+ * It is possible to swap the order of the width and height dimensions,
+ * provided that the same order is used in the input, the kernel, and the
+ * output.
+ *
+ * It is also possible to add an output kernel to the contraction; the output
+ * kernel is called by Eigen when it "finalizes" a block of the output tensor.
+ *
+ */
+template <typename Input, typename Kernel,
+          typename OutputKernel = const NoOpOutputKernel>
+EIGEN_DEVICE_FUNC
+    EIGEN_ALWAYS_INLINE static const typename internal::conditional<
+        internal::traits<Input>::Layout == ColMajor,
+        TensorReshapingOp<
+            const DSizes<typename internal::traits<Input>::Index,
+                         internal::traits<Input>::NumDimensions>,
+            const TensorContractionOp<
+                const array<IndexPair<typename internal::traits<Input>::Index>,
+                            1>,
+                const TensorReshapingOp<
+                    const DSizes<typename internal::traits<Input>::Index, 2>,
+                    const Kernel>,
+                const TensorReshapingOp<
+                    const DSizes<typename internal::traits<Input>::Index, 2>,
+                    const TensorImagePatchOp<Dynamic, Dynamic, const Input> >,
+                const OutputKernel> >,
+        TensorReshapingOp<
+            const DSizes<typename internal::traits<Input>::Index,
+                         internal::traits<Input>::NumDimensions>,
+            const TensorContractionOp<
+                const array<IndexPair<typename internal::traits<Input>::Index>,
+                            1>,
+                const TensorReshapingOp<
+                    const DSizes<typename internal::traits<Input>::Index, 2>,
+                    const TensorImagePatchOp<Dynamic, Dynamic, const Input> >,
+                const TensorReshapingOp<
+                    const DSizes<typename internal::traits<Input>::Index, 2>,
+                    const Kernel>,
+                const OutputKernel> > >::type
+    SpatialConvolution(const Input& input, const Kernel& kernel,
+                       const Index row_stride = 1, const Index col_stride = 1,
+                       const PaddingType padding_type = PADDING_SAME,
+                       const Index row_in_stride = 1,
+                       const Index col_in_stride = 1,
+                       const OutputKernel& output_kernel = OutputKernel()) {
+  typedef typename internal::traits<Input>::Index TensorIndex;
+  TensorRef<Tensor<typename internal::traits<Input>::Scalar,
+                   internal::traits<Input>::NumDimensions,
+                   internal::traits<Input>::Layout, TensorIndex> >
+      in(input);
+  TensorRef<Tensor<typename internal::traits<Kernel>::Scalar,
+                   internal::traits<Kernel>::NumDimensions,
+                   internal::traits<Kernel>::Layout, TensorIndex> >
+      kern(kernel);
+
+  EIGEN_STATIC_ASSERT(
+      internal::traits<Input>::Layout == internal::traits<Kernel>::Layout,
+      YOU_MADE_A_PROGRAMMING_MISTAKE)
+  const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
+
+  const int NumDims = internal::traits<Input>::NumDimensions;
+
+  // Number of filters to apply. This is the same as the output depth of the
+  // result.
+  const TensorIndex kernelFilters =
+      isColMajor ? kern.dimensions()[0] : kern.dimensions()[3];
+  // Number of channels. This is the same as the input depth.
+  const TensorIndex kernelChannels =
+      isColMajor ? kern.dimensions()[1] : kern.dimensions()[2];
+  const TensorIndex kernelRows =
+      isColMajor ? kern.dimensions()[2] : kern.dimensions()[1];
+  const TensorIndex kernelCols =
+      isColMajor ? kern.dimensions()[3] : kern.dimensions()[0];
+
+  const Index kernelRowsEff =
+      kernelRows + (kernelRows - 1) * (row_in_stride - 1);
+  const Index kernelColsEff =
+      kernelCols + (kernelCols - 1) * (col_in_stride - 1);
+
+  array<IndexPair<TensorIndex>, 1> contract_dims;
+  contract_dims[0] = IndexPair<TensorIndex>(1, 0);
+
+  const TensorIndex InputRows =
+      isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
+  const TensorIndex InputCols =
+      isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
+
+  TensorIndex out_height;
+  TensorIndex out_width;
+  switch (padding_type) {
+    case PADDING_VALID:
+      out_height = numext::ceil((InputRows - kernelRowsEff + 1.f) /
+                                static_cast<float>(row_stride));
+      out_width = numext::ceil((InputCols - kernelColsEff + 1.f) /
+                               static_cast<float>(col_stride));
+      break;
+    case PADDING_SAME:
+      out_height = numext::ceil(InputRows / static_cast<float>(row_stride));
+      out_width = numext::ceil(InputCols / static_cast<float>(col_stride));
+      break;
+    default:
+      // Initialize unused variables to avoid a compiler warning
+      out_height = 0;
+      out_width = 0;
+      eigen_assert(false && "unexpected padding");
+  }
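+
+  // For example (illustrative numbers): with InputRows == 32, kernelRows == 3
+  // and row_in_stride == 2 the effective kernel height is
+  // kernelRowsEff == 3 + 2 * (2 - 1) == 5; with row_stride == 2 this yields
+  // out_height == ceil((32 - 5 + 1) / 2.0) == 14 for PADDING_VALID and
+  // out_height == ceil(32 / 2.0) == 16 for PADDING_SAME.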
+
+  // Molds the output of the patch extraction code into a 2d tensor:
+  // - the first dimension (dims[0]): the patch values to be multiplied with the
+  // kernels
+  // - the second dimension (dims[1]): everything else
+  DSizes<TensorIndex, 2> pre_contract_dims;
+  if (isColMajor) {
+    pre_contract_dims[0] = kernelChannels * kernelRows * kernelCols;
+    pre_contract_dims[1] = out_height * out_width;
+    for (int i = 3; i < NumDims; ++i) {
+      pre_contract_dims[1] *= in.dimension(i);
+    }
+  } else {
+    pre_contract_dims[1] = kernelChannels * kernelRows * kernelCols;
+    pre_contract_dims[0] = out_height * out_width;
+    for (int i = 0; i < NumDims - 3; ++i) {
+      pre_contract_dims[0] *= in.dimension(i);
+    }
+  }
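+
+  // For example (illustrative numbers): a ColMajor input of shape
+  // (channels = 3, height = 32, width = 32, batch = 8) convolved with a 5x5
+  // kernel at stride 1 and PADDING_SAME gives
+  // pre_contract_dims == {3 * 5 * 5, 32 * 32 * 8} == {75, 8192}.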
+
+  // Molds the output of the contraction into the shape expected by the user
+  // (assuming this is ColMajor):
+  // - 1st dim: kernel filters
+  // - 2nd dim: output height
+  // - 3rd dim: output width
+  // - 4th dim and beyond: everything else including batch size
+  DSizes<TensorIndex, NumDims> post_contract_dims;
+  if (isColMajor) {
+    post_contract_dims[0] = kernelFilters;
+    post_contract_dims[1] = out_height;
+    post_contract_dims[2] = out_width;
+    for (int i = 3; i < NumDims; ++i) {
+      post_contract_dims[i] = in.dimension(i);
+    }
+  } else {
+    post_contract_dims[NumDims - 1] = kernelFilters;
+    post_contract_dims[NumDims - 2] = out_height;
+    post_contract_dims[NumDims - 3] = out_width;
+    for (int i = 0; i < NumDims - 3; ++i) {
+      post_contract_dims[i] = in.dimension(i);
+    }
+  }
+
+  DSizes<TensorIndex, 2> kernel_dims;
+  if (isColMajor) {
+    kernel_dims[0] = kernelFilters;
+    kernel_dims[1] = kernelChannels * kernelRows * kernelCols;
+  } else {
+    kernel_dims[0] = kernelChannels * kernelRows * kernelCols;
+    kernel_dims[1] = kernelFilters;
+  }
+  return choose(
+      Cond<internal::traits<Input>::Layout == ColMajor>(),
+      kernel.reshape(kernel_dims)
+          .contract(input
+                        .extract_image_patches(
+                            kernelRows, kernelCols, row_stride, col_stride,
+                            row_in_stride, col_in_stride, padding_type)
+                        .reshape(pre_contract_dims),
+                    contract_dims, output_kernel)
+          .reshape(post_contract_dims),
+      input
+          .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
+                                 row_in_stride, col_in_stride, padding_type)
+          .reshape(pre_contract_dims)
+          .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
+          .reshape(post_contract_dims));
+}
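+
+// Usage sketch (an assumption for exposition, not part of the original code):
+// with ColMajor tensors the helper is used as a lazily evaluated expression,
+// e.g.
+//
+//   Eigen::Tensor<float, 4> input(3, 32, 32, 8);   // (channels, H, W, batch)
+//   Eigen::Tensor<float, 4> kernel(16, 3, 5, 5);   // (filters, channels, kH, kW)
+//   Eigen::Tensor<float, 4> output(16, 32, 32, 8);
+//   output = Eigen::SpatialConvolution(input, kernel, /*row_stride=*/1,
+//                                      /*col_stride=*/1, Eigen::PADDING_SAME);
+//
+// The default OutputKernel (a NoOpOutputKernel) leaves the contraction result
+// unchanged.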
+
+}  // end namespace Eigen
+
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_INL_H_
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index ca5f4b2..f955bc7 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -20,1288 +20,9 @@
 
 #if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
 #include "tensorflow/core/kernels/eigen_contraction_kernel.h"
-#endif
 
 namespace Eigen {
-
 namespace internal {
-
-// WARNING: Most of the code here implicitly assumes that the matrix is in
-// ColMajor layout. This is guaranteed by the tensor contraction (see
-// TensorContraction.h).
-//
-// Inside Eigen a tensor contraction is represented by a matrix multiplication.
-// We don't want to actually extract image patches and reshape the result into
-// a matrix (this involves allocating huge extra memory), so the patch
-// extraction and reshape operations are implicit.
-//
-// TensorContractionInputMapper takes a matrix index and returns the coefficient
-// (or the packet) of the "virtual tensor", that would be at that index if we
-// were to actually reshape the result of patch extraction.
-//
-// TensorContractionSubMapper provides a similar view into the "virtual matrix"
-// at the given vertical and horizontal offsets.
-//
-// "Virtual matrix" dimensions:
-//   *0: kernelChannels * kernelRows * kernelCols;
-//    1: out_height * out_width; * OTHERS (e.g batches, etc...)
-//
-// *) extracted patches are continuous in memory (innermost dimension assuming
-//    col major layout)
-//
-// With this dimensions:
-//   row - offset within a single patch (in code: patchId)
-//   col - index of the extracted patch (in code: patchIndex)
-//         patchIndex ∈ [0..num_patches * OTHERS] (batch and other dimensions)
-//
-// TODO(ezhulenev): Consolidate this part of the code with the image patch
-// extraction code since they are both very similar.
-
-template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
-          typename Device, typename Scalar_, typename Index,
-          typename nocontract_t, typename contract_t, int Side, int packet_size,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
-class TensorContractionInputMapper<
-    Scalar_, Index, Side,
-    TensorEvaluator<
-        const TensorReshapingOp<NewDimension,
-                                const TensorImagePatchOp<Rows, Cols, ArgType> >,
-        Device>,
-    nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-    inner_dim_reordered, Alignment> {
- public:
-  typedef Scalar_ Scalar;
-
-  typedef TensorContractionInputMapper<
-      Scalar, Index, Side,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>
-      Self;
-
-  typedef TensorContractionSubMapper<
-      Scalar, Index, Side,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>
-      SubMapper;
-
-  typedef SubMapper VectorMapper;
-  typedef SubMapper LinearMapper;
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  EIGEN_DEVICE_FUNC
-  TensorContractionInputMapper(
-      const TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>& tensor,
-      const nocontract_t&, const nocontract_t&, const contract_t&,
-      const contract_t&)
-      : m_impl(tensor.impl().impl()) {
-    Index patch_rows;
-    Index patch_depth;
-    if (internal::traits<ArgType>::Layout == ColMajor) {
-      patch_depth = tensor.impl().dimensions()[0];
-      patch_rows = tensor.impl().dimensions()[1];
-      m_patch_cols = tensor.impl().dimensions()[2];
-      m_num_patches = tensor.impl().dimensions()[3];
-    } else {
-      const size_t NumDims = tensor.impl().dimensions().size();
-      patch_depth = tensor.impl().dimensions()[NumDims - 1];
-      patch_rows = tensor.impl().dimensions()[NumDims - 2];
-      m_patch_cols = tensor.impl().dimensions()[NumDims - 3];
-      m_num_patches = tensor.impl().dimensions()[NumDims - 4];
-    }
-
-    // Strides for navigating through the single patch.
-    m_patch_row_stride = patch_depth;
-    m_patch_col_stride = patch_rows * m_patch_row_stride;
-
-    m_patch_row_inflate_strides = tensor.impl().rowInflateStride();
-    m_patch_col_inflate_strides = tensor.impl().colInflateStride();
-
-    m_colStride = patch_rows;
-
-    m_outputRows = tensor.impl().outputRows();
-    m_row_strides = tensor.impl().userRowStride();
-    m_col_strides = tensor.impl().userColStride();
-
-    m_in_row_strides = tensor.impl().userInRowStride();
-    m_in_col_strides = tensor.impl().userInColStride();
-
-    if (internal::traits<ArgType>::Layout == ColMajor) {
-      m_inputRows = tensor.impl().impl().dimensions()[1];
-      m_inputCols = tensor.impl().impl().dimensions()[2];
-    } else {
-      const int NumDims = tensor.impl().impl().dimensions().size();
-      m_inputRows = tensor.impl().impl().dimensions()[NumDims - 2];
-      m_inputCols = tensor.impl().impl().dimensions()[NumDims - 3];
-    }
-
-    m_rowInputStride = patch_depth;
-    m_colInputStride = patch_depth * m_inputRows;
-    m_patchInputStride = patch_depth * m_inputRows * m_inputCols;
-
-    m_rowPaddingTop = tensor.impl().rowPaddingTop();
-    m_colPaddingLeft = tensor.impl().colPaddingLeft();
-
-    m_fastPatchRowStride =
-        internal::TensorIntDivisor<Index>(m_patch_row_stride);
-    m_fastPatchColStride =
-        internal::TensorIntDivisor<Index>(m_patch_col_stride);
-    m_fastInputRowStride =
-        internal::TensorIntDivisor<Index>(m_patch_row_inflate_strides);
-    m_fastInputColStride =
-        internal::TensorIntDivisor<Index>(m_patch_col_inflate_strides);
-    m_fastNumPatches = internal::TensorIntDivisor<Index>(m_num_patches);
-    m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
-    m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows);
-    m_fastDimZero = internal::TensorIntDivisor<Index>(patch_depth);
-  }
-
-  EIGEN_DEVICE_FUNC
-  TensorContractionInputMapper(const TensorContractionInputMapper& base_mapper)
-      : m_impl(base_mapper.m_impl) {
-    m_patch_cols = base_mapper.m_patch_cols;
-    m_num_patches = base_mapper.m_num_patches;
-
-    m_patch_row_stride = base_mapper.m_patch_row_stride;
-    m_patch_col_stride = base_mapper.m_patch_col_stride;
-
-    m_patch_row_inflate_strides = base_mapper.m_patch_row_inflate_strides;
-    m_patch_col_inflate_strides = base_mapper.m_patch_col_inflate_strides;
-
-    m_colStride = base_mapper.m_colStride;
-
-    m_rowInputStride = base_mapper.m_rowInputStride;
-    m_colInputStride = base_mapper.m_colInputStride;
-    m_patchInputStride = base_mapper.m_patchInputStride;
-
-    m_inputRows = base_mapper.m_inputRows;
-    m_inputCols = base_mapper.m_inputCols;
-
-    m_outputRows = base_mapper.m_outputRows;
-    m_row_strides = base_mapper.m_row_strides;
-    m_col_strides = base_mapper.m_col_strides;
-
-    m_in_row_strides = base_mapper.m_in_row_strides;
-    m_in_col_strides = base_mapper.m_in_col_strides;
-
-    m_rowPaddingTop = base_mapper.m_rowPaddingTop;
-    m_colPaddingLeft = base_mapper.m_colPaddingLeft;
-
-    m_fastPatchRowStride = base_mapper.m_fastPatchRowStride;
-    m_fastPatchColStride = base_mapper.m_fastPatchColStride;
-    m_fastInputRowStride = base_mapper.m_fastInputRowStride;
-    m_fastInputColStride = base_mapper.m_fastInputColStride;
-    m_fastNumPatches = base_mapper.m_fastNumPatches;
-    m_fastColStride = base_mapper.m_fastColStride;
-    m_fastOutputRows = base_mapper.m_fastOutputRows;
-    m_fastDimZero = base_mapper.m_fastDimZero;
-  }
-
-  // If true, turns off some optimizations for loading packets since the image
-  // patches are "non-standard" such as there are non-trivial strides or
-  // inflations in the input.
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool nonStandardPatches() const {
-    return m_in_row_strides != 1 || m_in_col_strides != 1 ||
-           m_patch_row_inflate_strides != 1 || m_patch_col_inflate_strides != 1;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
-    return SubMapper(*this, i, j);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
-    return LinearMapper(*this, i, j);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Scalar operator()(Index row) const {
-    Index rowIndex, colIndex, otherIndex;
-    computeBaseIndices(0, rowIndex, colIndex, otherIndex);
-    return loadCoeff(row, rowIndex, colIndex, otherIndex);
-  }
-
-  // Load the coefficient at the patchIndex location instead of the usual
-  // m_rowIndex,
-  // m_colIndex, m_otherIndex. This is currently only used by the gpu code.
-  // EIGEN_DEVICE_FUNC
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Scalar operator()(Index row, Index patchIndex) const {
-    Index rowIndex, colIndex, otherIndex;
-    computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex);
-    return loadCoeff(row, rowIndex, colIndex, otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacket(Index row) const {
-    Index rowIndex, colIndex, otherIndex;
-    computeBaseIndices(0, rowIndex, colIndex, otherIndex);
-    return loadPacket(row, rowIndex, colIndex, otherIndex);
-  }
-
-  // Load the packet at the patchIndex location instead of the usual m_rowIndex,
-  // m_colIndex, m_otherIndex. This is currently only used by the gpu code.
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacket(Index row, Index patchIndex) const {
-    Index rowIndex, colIndex, otherIndex;
-    computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex);
-    return loadPacket(row, rowIndex, colIndex, otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE const TensorEvaluator<ArgType, Device>& impl() const {
-    return m_impl;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_rowInputStride; }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchRows() const { return m_colStride; }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; }
-
- private:
-  friend class TensorContractionSubMapper<
-      Scalar, Index, Side,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>;
-
-  // Load coefficient from a patch specified by the "within patch offset"
-  // (patchId) and the precomputed indices of the first element of the patch.
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Scalar loadCoeff(Index patchId, Index rowIndex,
-                                       Index colIndex, Index otherIndex) const {
-    // Find the offset of the element wrt the location of the first element.
-    const Index patchOffset = patchId / m_fastDimZero;
-
-    const Index colOffset = patchOffset / m_fastColStride;
-    const Index inputCol = colIndex + colOffset * m_in_col_strides;
-    const Index origInputCol =
-        (m_patch_col_inflate_strides == 1)
-            ? inputCol
-            : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
-
-    const Index rowOffset = patchOffset - colOffset * m_colStride;
-    const Index inputRow = rowIndex + rowOffset * m_in_row_strides;
-    const Index origInputRow =
-        (m_patch_row_inflate_strides == 1)
-            ? inputRow
-            : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
-    if (origInputCol < 0 || origInputRow < 0 || origInputCol >= m_inputCols ||
-        origInputRow >= m_inputRows ||
-        (inputCol != origInputCol * m_patch_col_inflate_strides) ||
-        (inputRow != origInputRow * m_patch_row_inflate_strides)) {
-      return Scalar(0);
-    }
-    const Index depth = patchId - patchOffset * patchDepth();
-    const Index inputIndex = depth + origInputRow * m_rowInputStride +
-                             origInputCol * m_colInputStride + otherIndex;
-    return m_impl.coeff(inputIndex);
-  }
-
-  // This is the same as loadCoeff(...), but optimized for all `inflate_strides`
-  // and `in_strides` equal to 1 (template specialization without templates).
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Scalar loadCoeffStandard(Index patchId, Index rowIndex,
-                                               Index colIndex,
-                                               Index otherIndex) const {
-    eigen_assert(!nonStandardPatches());
-
-    // Find the offset of the element wrt the location of the first element.
-    const Index patchOffset = patchId / m_fastDimZero;
-    const Index colOffset = patchOffset / m_fastColStride;
-    const Index rowOffset = patchOffset - colOffset * m_colStride;
-    const Index inputCol = colIndex + colOffset;
-    const Index inputRow = rowIndex + rowOffset;
-    if (inputCol < 0 || inputCol >= m_inputCols || inputRow < 0 ||
-        inputRow >= m_inputRows) {
-      return Scalar(0);
-    }
-    const Index depth = patchId - patchOffset * patchDepth();
-    const Index inputIndex = depth + inputRow * m_rowInputStride +
-                             inputCol * m_colInputStride + otherIndex;
-    return m_impl.coeff(inputIndex);
-  }
-
-  // Load packet from a patch specified by the "within patch offset"
-  // (patchId) and the precomputed indices of the first element of the patch.
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacket(Index patchId, Index rowIndex,
-                                        Index colIndex,
-                                        Index otherIndex) const {
-    const Index packetSize = internal::unpacket_traits<Packet>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
-
-    if (nonStandardPatches()) {
-      return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
-    }
-    return loadPacketStandard(patchId, rowIndex, colIndex, otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index patchId, Index rowIndex,
-                                                Index colIndex,
-                                                Index otherIndex) const {
-    const Index packetSize = internal::unpacket_traits<Packet>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
-
-    eigen_assert(!nonStandardPatches());
-
-    if ((patchDepth() % packetSize) == 0) {
-      return loadPacketFast(patchId, rowIndex, colIndex, otherIndex);
-    } else {
-      // Offsets and input calculations here are identical to
-      // loadCoeffStandard(...), but repeated twice.
-
-      const Index patchOffsets[2] = {
-          patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero};
-
-      const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride,
-                                   patchOffsets[1] / m_fastColStride};
-      const Index inputCols[2] = {colIndex + colOffsets[0],
-                                  colIndex + colOffsets[1]};
-      if (inputCols[0] >= m_inputCols || inputCols[1] < 0) {
-        // all zeros
-        return internal::pset1<Packet>(Scalar(0));
-      }
-
-      if (inputCols[0] == inputCols[1]) {
-        const Index rowOffsets[2] = {
-            patchOffsets[0] - colOffsets[0] * m_colStride,
-            patchOffsets[1] - colOffsets[1] * m_colStride};
-        eigen_assert(rowOffsets[0] <= rowOffsets[1]);
-        const Index inputRows[2] = {rowIndex + rowOffsets[0],
-                                    rowIndex + rowOffsets[1]};
-
-        if (inputRows[0] >= m_inputRows || inputRows[1] < 0) {
-          // all zeros
-          return internal::pset1<Packet>(Scalar(0));
-        }
-
-        if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) {
-          // no padding
-          const Index depth = patchId - patchOffsets[0] * patchDepth();
-          const Index inputIndex = depth + inputRows[0] * m_rowInputStride +
-                                   inputCols[0] * m_colInputStride + otherIndex;
-          return m_impl.template packet<Unaligned>(inputIndex);
-        }
-      }
-    }
-    return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index patchId, Index rowIndex,
-                                            Index colIndex,
-                                            Index otherIndex) const {
-    const Index packetSize = internal::unpacket_traits<Packet>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
-
-    eigen_assert(!nonStandardPatches());
-    eigen_assert((patchDepth() % packetSize) == 0);
-    // Find the offset of the element wrt the location of the first element.
-    const Index patchOffset = patchId / m_fastDimZero;
-    eigen_assert((patchId + packetSize - 1) / m_fastDimZero == patchOffset);
-
-    const Index colOffset = patchOffset / m_fastColStride;
-    const Index rowOffset = patchOffset - colOffset * m_colStride;
-    const Index inputCol = colIndex + colOffset;
-    const Index inputRow = rowIndex + rowOffset;
-    if (inputCol < 0 || inputRow < 0 || inputCol >= m_inputCols ||
-        inputRow >= m_inputRows) {
-      // all zeros
-      return internal::pset1<Packet>(Scalar(0));
-    }
-    // no padding
-    const Index depth = patchId - patchOffset * patchDepth();
-    const Index inputIndex = depth + inputRow * m_rowInputStride +
-                             inputCol * m_colInputStride + otherIndex;
-    return m_impl.template packet<Unaligned>(inputIndex);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet packetWithPossibleZero(
-      Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const {
-    const int packetSize = internal::unpacket_traits<Packet>::size;
-    EIGEN_ALIGN_MAX
-    typename internal::remove_const<Scalar>::type values[packetSize];
-    for (int i = 0; i < packetSize; ++i) {
-      values[i] = loadCoeff(patchId + i, rowIndex, colIndex, otherIndex);
-    }
-    Packet rslt = internal::pload<Packet>(values);
-    return rslt;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void computeBaseIndices(
-      Index patchIndex, Index& rowIndex, Index& colIndex,
-      Index& otherIndex) const {
-    const size_t NumInputDims = array_size<
-        typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
-    otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches;
-    const Index patch2DIndex = (NumInputDims == 3)
-                                   ? patchIndex
-                                   : (patchIndex - otherIndex * m_num_patches);
-    otherIndex *= m_patchInputStride;
-    colIndex = patch2DIndex / m_fastOutputRows;
-    rowIndex = patch2DIndex - colIndex * m_outputRows;
-    colIndex = colIndex * m_col_strides - m_colPaddingLeft;
-    rowIndex = rowIndex * m_row_strides - m_rowPaddingTop;
-  }
-
-  Index m_patch_cols;   // number of columns in the patch
-  Index m_num_patches;  // number of patches to extract.
-
-  // Strides for navigating through the single patch.
-  Index m_patch_row_stride;
-  Index m_patch_col_stride;
-  internal::TensorIntDivisor<Index> m_fastPatchRowStride;
-  internal::TensorIntDivisor<Index> m_fastPatchColStride;
-
-  Index m_patch_row_inflate_strides;  // the strides for row inflation in the
-                                      // image patch
-  Index m_patch_col_inflate_strides;  // the strides for col inflation in the
-                                      // image patch
-  // Fast representation of inflation strides.
-  internal::TensorIntDivisor<Index> m_fastInputRowStride;
-  internal::TensorIntDivisor<Index> m_fastInputColStride;
-
-  Index m_otherStride;
-  Index m_colStride;
-  internal::TensorIntDivisor<Index> m_fastNumPatches;
-  internal::TensorIntDivisor<Index> m_fastColStride;
-
-  Index m_rowInputStride;    // row stride in the input tensor
-  Index m_colInputStride;    // col stride in the input tensor
-  Index m_patchInputStride;  // patch stride in the input tensor
-
-  Index m_inputRows;  // Number of rows in the input tensor
-  Index m_inputCols;  // Number of cols in the input tensor
-
-  Index m_outputRows;  // Number of patch rows
-
-  Index m_row_strides;  // User specified row stride
-  Index m_col_strides;  // User specified col stride
-
-  Index m_in_row_strides;  // User specified input row stride
-  Index m_in_col_strides;  // User specified input col stride
-
-  Index m_rowPaddingTop;   // Row padding
-  Index m_colPaddingLeft;  // Column padding
-
-  internal::TensorIntDivisor<Index> m_fastOutputRows;
-  internal::TensorIntDivisor<Index> m_fastDimZero;
-
-  const TensorEvaluator<ArgType, Device> m_impl;
-};
-
-template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
-          typename Device, typename Scalar, typename Index,
-          typename nocontract_t, typename contract_t, int Side, int packet_size,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
-class TensorContractionSubMapper<
-    Scalar, Index, Side,
-    TensorEvaluator<
-        const TensorReshapingOp<NewDimension,
-                                const TensorImagePatchOp<Rows, Cols, ArgType> >,
-        Device>,
-    nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-    inner_dim_reordered, Alignment> {
- public:
-  typedef typename packet_traits<Scalar>::type Packet;
-  typedef typename packet_traits<Scalar>::half HalfPacket;
-
-  typedef TensorContractionInputMapper<
-      Scalar, Index, Side,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>
-      ParentMapper;
-
-  typedef TensorContractionSubMapper<
-      Scalar, Index, Side,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>
-      Self;
-
-  typedef Self LinearMapper;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(
-      const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
-      : m_depth_offset(vert_offset),
-        m_col_offset(horiz_offset),
-        m_base_mapper(base_mapper) {
-    m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex,
-                                     m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(
-      const Self& base_mapper, Index vert_offset, Index horiz_offset)
-      : m_depth_offset(vert_offset + base_mapper.m_depth_offset),
-        m_col_offset(horiz_offset + base_mapper.m_col_offset),
-        m_base_mapper(base_mapper.m_base_mapper) {
-    m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex,
-                                     m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
-    return m_base_mapper.loadCoeff(i + m_depth_offset, m_rowIndex, m_colIndex,
-                                   m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i,
-                                                          Index j) const {
-    return m_base_mapper(i + m_depth_offset, j + m_col_offset);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
-    return m_base_mapper.loadPacket(i + m_depth_offset, m_rowIndex, m_colIndex,
-                                    m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i,
-                                                          Index j) const {
-    return m_base_mapper.template loadPacket<Alignment>(i + m_depth_offset,
-                                                        j + m_col_offset);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar
-  loadCoeffStandard(Index i) const {
-    return m_base_mapper.loadCoeffStandard(i + m_depth_offset, m_rowIndex,
-                                           m_colIndex, m_otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index i) const {
-    return m_base_mapper.loadPacketFast(i + m_depth_offset, m_rowIndex,
-                                        m_colIndex, m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet
-  loadPacketStandard(Index i) const {
-    return m_base_mapper.loadPacketStandard(i + m_depth_offset, m_rowIndex,
-                                            m_colIndex, m_otherIndex);
-  }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC bool aligned(Index) const {
-    return false;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool nonStandardPatches() const {
-    return m_base_mapper.nonStandardPatches();
-  }
-
-  // Max(Col|Row|Depth): compute the upper limits for the column, row and depth
-  // indices, respectively, that fit into the peeled_k elements starting at
-  // m_depth_offset.
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const {
-    const Index max_col =
-        (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) /
-        fastPatchColStride();
-    return std::min<Index>(1 + max_col, patchCols());
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k,
-                                   const Index col) const {
-    const Index max_row = (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) -
-                           col * patchColStride()) /
-                          fastPatchRowStride();
-    return std::min<Index>(1 + max_row, patchRows());
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index maxDepth(const Index peeled_k, const Index col,
-                                     Index row) const {
-    const Index max_depth = m_depth_offset + peeled_k -  //
-                            col * patchColStride() -     //
-                            row * patchRowStride();
-    return std::min<Index>(max_depth, patchDepth());
-  }
-
-  // MaxDepth uses only the remaining number of elements in the peeled_k.
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index maxDepth(const Index num_elements,
-                                     const Index start_depth) const {
-    return std::min<Index>(start_depth + num_elements, patchDepth());
-  }
-
-  // Every register matters in this code, so to prevent register spilling we
-  // sometimes use, instead of the variable you would expect to see, another
-  // one that is guaranteed to have the same value. E.g. patch depth is always
-  // the same as input depth, and it is also the same as the input row stride.
-  // A number of other parameters have similar relations.
-
-  typedef internal::TensorIntDivisor<Index> IndexDivisor;
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchDepth() const {
-    return m_base_mapper.m_rowInputStride;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchRows() const {
-    return m_base_mapper.m_colStride;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchCols() const {
-    return m_base_mapper.m_patch_cols;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchRowStride() const {
-    eigen_assert(patchDepth() == m_base_mapper.m_patch_row_stride &&
-                 "Patch depth must be equal to patch row stride.");
-    return patchDepth();
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchColStride() const {
-    return m_base_mapper.m_patch_col_stride;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE IndexDivisor fastPatchRowStride() const {
-    eigen_assert(patchDepth() == m_base_mapper.m_patch_row_stride &&
-                 "Patch depth must be equal to patch row stride.");
-    return m_base_mapper.m_fastDimZero;  // patch_depth
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE IndexDivisor fastPatchColStride() const {
-    return m_base_mapper.m_fastPatchColStride;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth,
-                                             const Index baseIndex) const {
-    const Index inputIndex = depth + baseIndex;
-    return m_base_mapper.m_impl.template packet<Unaligned>(inputIndex);
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Scalar coeffNoPadding(const Index depth,
-                                            const Index baseIndex) const {
-    const Index inputIndex = depth + baseIndex;
-    return m_base_mapper.m_impl.coeff(inputIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool padRow(const Index row) const {
-    const Index r = m_rowIndex + row;
-    return r < 0 || r >= m_base_mapper.m_inputRows;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool padAnyRow(const Index first_row,
-                                     const Index last_row) const {
-    return m_rowIndex + first_row < 0 ||
-           m_rowIndex + last_row >= m_base_mapper.m_inputRows;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool padCol(const Index col) const {
-    const Index c = m_colIndex + col;
-    return c < 0 || c >= m_base_mapper.m_inputCols;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index baseIndex(const Index row, const Index col) const {
-    const Index r = m_rowIndex + row;
-    const Index c = m_colIndex + col;
-    return r * m_base_mapper.m_rowInputStride +
-           c * m_base_mapper.m_colInputStride + m_otherIndex;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index rowStride() const {
-    return m_base_mapper.m_row_strides;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index colStride() const {
-    return m_base_mapper.m_col_strides;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index rowOffset() const {
-    const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
-    const Index colOffset = patchOffset / m_base_mapper.m_fastColStride;
-    return patchOffset - colOffset * m_base_mapper.m_colStride;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index colOffset() const {
-    const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
-    const Index colOffset = patchOffset / m_base_mapper.m_fastColStride;
-    return colOffset;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index depthOffset() const {
-    return m_depth_offset % patchDepth();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper
-  getLinearMapper(Index i, Index j) const {
-    return LinearMapper(m_base_mapper, i + m_depth_offset, j + m_col_offset);
-  }
-
- private:
-  Index m_depth_offset;  // First row in the input matrix
-  Index m_col_offset;    // First col in the input matrix
-
-  // Knowing that: col_offset == patchIndex * OTHERS, we keep precomputed base
-  // indices for the first element in a patch specified by col_offset
-  // (see computeBaseIndices(...) for details).
-  Index m_rowIndex;
-  Index m_colIndex;
-  Index m_otherIndex;
-
-  const ParentMapper m_base_mapper;  // Keeping a copy instead of a reference
-                                     // performs better in benchmarks.
-};
-
-// Arrange a block of the right input matrix (in our case it's always a "virtual
-// matrix" constructed from extracted image patches) in contiguous memory.
-//
-// Given column major input (A0 beside A1 in memory):
-// A0 B0 C0 D0  E0 F0 G0 H0 ... Z0
-// A1 B1 C1 D1  E1 F1 G1 H1 ... Z1
-// A2 B2 C2 D2  E2 F2 G2 H2 ... Z2
-// A3 B3 C3 D3  E3 F3 G3 H3 ... Z3
-// A4 B4 C4 D4  E4 F4 G4 H4 ... Z4
-// A5 B5 C5 D5  E5 F5 G5 H5 ... Z5
-// A6 B6 C6 D6  E6 F6 G6 H6 ... Z6
-// A7 B7 C7 D7  E7 F7 G7 H7 ... Z7
-// A8 ...
-// ...
-//
-// *) A, B, C, ... - patches extracted from the original input.
-// *) A0, A1, A2 ... - values from the same patch at different offsets.
-//
-// The traversal (packed rhs memory) order (B0 beside A0 in memory):
-// A0 B0 C0 D0 A1 B1 C1 D1 ...
-// E0 F0 G0 H0 E1 F1 G1 H1 ...
-// ...
-// Z0 Z1 Z2 Z3 Z4 Z5 Z6 Z7 ... <- doesn't belong to any block (nr = 4)
-//
-// This traversal order must be the same as in default gemm_pack_rhs defined in
-// GeneralBlockPanelKernel.h.
-//
-// *) nr - number of registers along the 'n' dimension.
-//    See GeneralBlockPanelKernel.h and "Anatomy of High-Performance Matrix
-//    Multiplication" paper.
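
To make the traversal order above concrete, here is a small standalone sketch (not the Eigen implementation; a plain std::vector stands in for the SubMapper and the sizes are made up) that packs a column-major depth x cols matrix in the same nr == 4 order, with the trailing columns copied one at a time:

#include <cstdio>
#include <vector>

int main() {
  const int depth = 3, cols = 6;  // hypothetical sizes
  std::vector<float> rhs(depth * cols);
  for (int j = 0; j < cols; ++j)
    for (int k = 0; k < depth; ++k)
      rhs[j * depth + k] = j * 10 + k;  // column-major "virtual matrix"

  std::vector<float> block;
  const int packet_cols4 = (cols / 4) * 4;
  // Groups of 4 columns: emit row k of columns j2..j2+3 before moving to k+1
  // (A0 B0 C0 D0 A1 B1 C1 D1 ... in the notation above).
  for (int j2 = 0; j2 < packet_cols4; j2 += 4)
    for (int k = 0; k < depth; ++k)
      for (int c = 0; c < 4; ++c) block.push_back(rhs[(j2 + c) * depth + k]);
  // Remaining columns are copied one at a time (the nr == 1 tail, Z0 Z1 ...).
  for (int j2 = packet_cols4; j2 < cols; ++j2)
    for (int k = 0; k < depth; ++k) block.push_back(rhs[j2 * depth + k]);

  for (float v : block) std::printf("%.0f ", v);
  std::printf("\n");
  return 0;
}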
-template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
-          typename Device, typename Scalar, typename Index,
-          typename nocontract_t, typename contract_t, int packet_size,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
-          int nr>
-struct gemm_pack_rhs<
-    Scalar, Index,
-    TensorContractionSubMapper<
-        Scalar, Index, Rhs,
-        TensorEvaluator<
-            const TensorReshapingOp<
-                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-            Device>,
-        nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-        inner_dim_reordered, Alignment>,
-    nr, ColMajor, false, false> {
-  typedef TensorContractionSubMapper<
-      Scalar, Index, Rhs,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>
-      SubMapper;
-  typedef SubMapper DataMapper;
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE)
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
-                                    Index depth, Index cols, Index stride = 0,
-                                    Index offset = 0) const {
-    eigen_assert(stride == 0);
-    eigen_assert(offset == 0);
-
-    const Index packet_cols4 = (cols / 4) * 4;
-    const Index peeled_k = (depth / packet_size) * packet_size;
-    const bool non_standard_patches = rhs.nonStandardPatches();
-
-    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
-      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
-      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
-      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
-
-      Index k = 0;
-      if ((packet_size % 4) == 0 && !non_standard_patches) {
-        // FAST PATH:
-        // Iterate over patch columns and rows if we know that a single
-        // packet does not span across multiple rows or columns.
-        if ((rhs.patchDepth() % packet_size) == 0) {
-          const Index start_col = rhs.colOffset();
-          const Index max_col = rhs.maxCol(peeled_k);
-
-          for (Index c = start_col; c < max_col; ++c) {
-            eigen_assert(k <= peeled_k);
-
-            const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
-            const Index max_row = rhs.maxRow(peeled_k, c);
-
-            const bool pad_col0 = dm0.padCol(c);
-            const bool pad_col1 = dm1.padCol(c);
-            const bool pad_col2 = dm2.padCol(c);
-            const bool pad_col3 = dm3.padCol(c);
-
-            // Check if we can squeeze reads along the `row` and `depth`
-            // dimensions (two innermost dimensions).
-            if (!pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
-                !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&  //
-                !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&  //
-                !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&  //
-                !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) {
-              // Compute how many elements we can squeeze read.
-              const Index start_depth =
-                  (c == start_col) ? rhs.depthOffset() : 0;
-
-              // Upper bound for the number of elements in the depth dimension
-              // that we can squeeze read.
-              const Index squeeze_length =
-                  (max_row - start_row) * rhs.patchDepth() - start_depth;
-
-              // Do not overshoot beyond the block size.
-              const Index max_depth =
-                  start_depth + std::min<Index>(peeled_k - k, squeeze_length);
-              eigen_assert((max_depth - start_depth) % packet_size == 0);
-
-              const Index idx0 = dm0.baseIndex(start_row, c);
-              const Index idx1 = dm1.baseIndex(start_row, c);
-              const Index idx2 = dm2.baseIndex(start_row, c);
-              const Index idx3 = dm3.baseIndex(start_row, c);
-
-              for (Index d = start_depth; d < max_depth; d += packet_size) {
-                eigen_assert(k < peeled_k);
-                PacketBlock<Packet, 4> kernel;
-                kernel.packet[0] = rhs.packetNoPadding(d, idx0);
-                kernel.packet[1] = rhs.packetNoPadding(d, idx1);
-                kernel.packet[2] = rhs.packetNoPadding(d, idx2);
-                kernel.packet[3] = rhs.packetNoPadding(d, idx3);
-                ptranspose(kernel);
-                pstoreu(block + 0 * packet_size, kernel.packet[0]);
-                pstoreu(block + 1 * packet_size, kernel.packet[1]);
-                pstoreu(block + 2 * packet_size, kernel.packet[2]);
-                pstoreu(block + 3 * packet_size, kernel.packet[3]);
-                block += 4 * packet_size;
-                k += packet_size;
-              }
-
-              // Go to the next column.
-              continue;
-            }
-
-            // If we can't squeeze reads, process rows one by one.
-            for (Index r = start_row; r < max_row; ++r) {
-              eigen_assert(k <= peeled_k);
-
-              const bool pad0 = pad_col0 || dm0.padRow(r);
-              const bool pad1 = pad_col1 || dm1.padRow(r);
-              const bool pad2 = pad_col2 || dm2.padRow(r);
-              const bool pad3 = pad_col3 || dm3.padRow(r);
-
-              const Index idx0 = dm0.baseIndex(r, c);
-              const Index idx1 = dm1.baseIndex(r, c);
-              const Index idx2 = dm2.baseIndex(r, c);
-              const Index idx3 = dm3.baseIndex(r, c);
-
-              const Index start_depth = ((c == start_col) && (r == start_row))
-                                            ? rhs.depthOffset()
-                                            : 0;
-              const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
-              eigen_assert((max_depth - start_depth) % packet_size == 0);
-
-              for (Index d = start_depth; d < max_depth; d += packet_size) {
-                eigen_assert(k < peeled_k);
-                PacketBlock<Packet, 4> kernel;
-                kernel.packet[0] = pad0 ? pset1<Packet>(Scalar(0))
-                                        : rhs.packetNoPadding(d, idx0);
-                kernel.packet[1] = pad1 ? pset1<Packet>(Scalar(0))
-                                        : rhs.packetNoPadding(d, idx1);
-                kernel.packet[2] = pad2 ? pset1<Packet>(Scalar(0))
-                                        : rhs.packetNoPadding(d, idx2);
-                kernel.packet[3] = pad3 ? pset1<Packet>(Scalar(0))
-                                        : rhs.packetNoPadding(d, idx3);
-                ptranspose(kernel);
-                pstoreu(block + 0 * packet_size, kernel.packet[0]);
-                pstoreu(block + 1 * packet_size, kernel.packet[1]);
-                pstoreu(block + 2 * packet_size, kernel.packet[2]);
-                pstoreu(block + 3 * packet_size, kernel.packet[3]);
-                block += 4 * packet_size;
-                k += packet_size;
-              }
-            }
-          }
-
-          // The loop above should fill peeled_k elements.
-          eigen_assert(peeled_k == k);
-
-        } else {
-          for (; k < peeled_k; k += packet_size) {
-            PacketBlock<Packet, 4> kernel;
-            kernel.packet[0] = dm0.loadPacketStandard(k);
-            kernel.packet[1] = dm1.loadPacketStandard(k);
-            kernel.packet[2] = dm2.loadPacketStandard(k);
-            kernel.packet[3] = dm3.loadPacketStandard(k);
-            ptranspose(kernel);
-            pstoreu(block + 0 * packet_size, kernel.packet[0]);
-            pstoreu(block + 1 * packet_size, kernel.packet[1]);
-            pstoreu(block + 2 * packet_size, kernel.packet[2]);
-            pstoreu(block + 3 * packet_size, kernel.packet[3]);
-            block += 4 * packet_size;
-          }
-        }
-      }
-
-      // Copy the remaining coefficients of the column block after the peeled_k.
-      if (!rhs.nonStandardPatches()) {
-        for (; k < depth; k++) {
-          block[0] = dm0.loadCoeffStandard(k);
-          block[1] = dm1.loadCoeffStandard(k);
-          block[2] = dm2.loadCoeffStandard(k);
-          block[3] = dm3.loadCoeffStandard(k);
-          block += 4;
-        }
-      } else {
-        for (; k < depth; k++) {
-          block[0] = dm0(k);
-          block[1] = dm1(k);
-          block[2] = dm2(k);
-          block[3] = dm3(k);
-          block += 4;
-        }
-      }
-    }
-
-    // Copy the remaining columns one at a time (nr==1).
-    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
-      for (Index k = 0; k < depth; k++) {
-        *block = dm0(k);
-        block += 1;
-      }
-    }
-  }
-};
-
-// Template specialization for packet_size = 2. We must special-case packet
-// blocks with nr > packet_size, e.g. PacketBlock<Packet2d, 4>.
-template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
-          typename Device, typename Scalar, typename Index,
-          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
-          bool inner_dim_reordered, int Alignment, int nr>
-struct gemm_pack_rhs<
-    Scalar, Index,
-    TensorContractionSubMapper<
-        Scalar, Index, Rhs,
-        TensorEvaluator<
-            const TensorReshapingOp<
-                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-            Device>,
-        nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
-        Alignment>,
-    nr, ColMajor, false, false> {
-  typedef TensorContractionSubMapper<
-      Scalar, Index, Rhs,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
-      Alignment>
-      SubMapper;
-  typedef SubMapper DataMapper;
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE)
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
-                                    Index depth, Index cols, Index stride = 0,
-                                    Index offset = 0) const {
-    eigen_assert(stride == 0);
-    eigen_assert(offset == 0);
-
-    const int packet_size = 2;
-    const Index packet_cols4 = (cols / 4) * 4;
-    const Index peeled_k = (depth / packet_size) * packet_size;
-    const bool non_standard_patches = rhs.nonStandardPatches();
-
-    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
-      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
-      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
-      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
-
-      Index k = 0;
-      if (!non_standard_patches) {
-        // FAST PATH:
-        // Iterate over patch columns and rows if we know that a single
-        // packet does not span across multiple rows or columns.
-        if ((rhs.patchDepth() % packet_size) == 0) {
-          const Index start_col = rhs.colOffset();
-          const Index max_col = rhs.maxCol(peeled_k);
-
-          for (Index c = start_col; c < max_col; ++c) {
-            eigen_assert(k <= peeled_k);
-
-            const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
-            const Index max_row = rhs.maxRow(peeled_k, c);
-
-            const bool pad_col0 = dm0.padCol(c);
-            const bool pad_col1 = dm1.padCol(c);
-            const bool pad_col2 = dm2.padCol(c);
-            const bool pad_col3 = dm3.padCol(c);
-
-            // We can squeeze reads along the `row` and `depth` dimensions if
-            // the row stride is `1`, which means that `row` and `depth`
-            // dimensions are contiguous (two innermost dimensions).
-            if (rhs.rowStride() == 1 &&                                //
-                !pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
-                !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&  //
-                !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&  //
-                !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&  //
-                !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) {
-              // Compute how many elements we can squeeze read.
-              const Index start_depth =
-                  (c == start_col) ? rhs.depthOffset() : 0;
-
-              // Upper bound for the number of elements in the depth dimension
-              // that we can squeeze read.
-              const Index squeeze_length =
-                  (max_row - start_row) * rhs.patchDepth() - start_depth;
-
-              // Do not overshoot beyond the block size.
-              const Index max_depth =
-                  start_depth + std::min<Index>(peeled_k - k, squeeze_length);
-              eigen_assert((max_depth - start_depth) % packet_size == 0);
-
-              const Index idx0 = dm0.baseIndex(start_row, c);
-              const Index idx1 = dm1.baseIndex(start_row, c);
-              const Index idx2 = dm2.baseIndex(start_row, c);
-              const Index idx3 = dm3.baseIndex(start_row, c);
-
-              for (Index d = start_depth; d < max_depth; d += packet_size) {
-                PacketBlock<Packet, 2> kernel0;
-                PacketBlock<Packet, 2> kernel1;
-                kernel0.packet[0] = rhs.packetNoPadding(d, idx0);
-                kernel0.packet[1] = rhs.packetNoPadding(d, idx1);
-                kernel1.packet[0] = rhs.packetNoPadding(d, idx2);
-                kernel1.packet[1] = rhs.packetNoPadding(d, idx3);
-                ptranspose(kernel0);
-                ptranspose(kernel1);
-                pstoreu(block + 0 * packet_size, kernel0.packet[0]);
-                pstoreu(block + 1 * packet_size, kernel1.packet[0]);
-                pstoreu(block + 2 * packet_size, kernel0.packet[1]);
-                pstoreu(block + 3 * packet_size, kernel1.packet[1]);
-                block += 4 * packet_size;
-                k += packet_size;
-              }
-
-              // Go to the next column.
-              continue;
-            }
-
-            // If we can't squeeze reads, process rows one by one.
-            for (Index r = start_row; r < max_row; ++r) {
-              eigen_assert(k <= peeled_k);
-
-              const bool pad0 = pad_col0 || dm0.padRow(r);
-              const bool pad1 = pad_col1 || dm1.padRow(r);
-              const bool pad2 = pad_col2 || dm2.padRow(r);
-              const bool pad3 = pad_col3 || dm3.padRow(r);
-
-              const Index idx0 = dm0.baseIndex(r, c);
-              const Index idx1 = dm1.baseIndex(r, c);
-              const Index idx2 = dm2.baseIndex(r, c);
-              const Index idx3 = dm3.baseIndex(r, c);
-
-              const Index start_depth = ((c == start_col) && (r == start_row))
-                                            ? rhs.depthOffset()
-                                            : 0;
-              const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
-              eigen_assert((max_depth - start_depth) % packet_size == 0);
-
-              for (Index d = start_depth; d < max_depth; d += packet_size) {
-                eigen_assert(k < peeled_k);
-                PacketBlock<Packet, 2> kernel0;
-                PacketBlock<Packet, 2> kernel1;
-                kernel0.packet[0] = pad0 ? pset1<Packet>(Scalar(0))
-                                         : rhs.packetNoPadding(d, idx0);
-                kernel0.packet[1] = pad1 ? pset1<Packet>(Scalar(0))
-                                         : rhs.packetNoPadding(d, idx1);
-                kernel1.packet[0] = pad2 ? pset1<Packet>(Scalar(0))
-                                         : rhs.packetNoPadding(d, idx2);
-                kernel1.packet[1] = pad3 ? pset1<Packet>(Scalar(0))
-                                         : rhs.packetNoPadding(d, idx3);
-                ptranspose(kernel0);
-                ptranspose(kernel1);
-                pstoreu(block + 0 * packet_size, kernel0.packet[0]);
-                pstoreu(block + 1 * packet_size, kernel1.packet[0]);
-                pstoreu(block + 2 * packet_size, kernel0.packet[1]);
-                pstoreu(block + 3 * packet_size, kernel1.packet[1]);
-                block += 4 * packet_size;
-                k += packet_size;
-              }
-            }
-          }
-
-          // The loop above should fill peeled_k elements.
-          eigen_assert(peeled_k == k);
-
-        } else {
-          // A packet can span multiple rows or columns, so we have to go
-          // through the slower "standard" path.
-          for (; k < peeled_k; k += packet_size) {
-            PacketBlock<Packet, 2> kernel0;
-            PacketBlock<Packet, 2> kernel1;
-            kernel0.packet[0] = dm0.loadPacketStandard(k);
-            kernel0.packet[1] = dm1.loadPacketStandard(k);
-            kernel1.packet[0] = dm2.loadPacketStandard(k);
-            kernel1.packet[1] = dm3.loadPacketStandard(k);
-            ptranspose(kernel0);
-            ptranspose(kernel1);
-            pstoreu(block + 0 * packet_size, kernel0.packet[0]);
-            pstoreu(block + 1 * packet_size, kernel1.packet[0]);
-            pstoreu(block + 2 * packet_size, kernel0.packet[1]);
-            pstoreu(block + 3 * packet_size, kernel1.packet[1]);
-            block += 4 * packet_size;
-          }
-        }
-      }
-
-      // Copy the remaining coefficients of the column block after the peeled_k.
-      if (!non_standard_patches) {
-        for (; k < depth; k++) {
-          block[0] = dm0.loadCoeffStandard(k);
-          block[1] = dm1.loadCoeffStandard(k);
-          block[2] = dm2.loadCoeffStandard(k);
-          block[3] = dm3.loadCoeffStandard(k);
-          block += 4;
-        }
-      } else {
-        for (; k < depth; k++) {
-          block[0] = dm0(k);
-          block[1] = dm1(k);
-          block[2] = dm2(k);
-          block[3] = dm3(k);
-          block += 4;
-        }
-      }
-    }
-
-    // Copy the remaining columns one at a time (nr==1).
-    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
-      for (Index k = 0; k < depth; k++) {
-        *block = dm0(k);
-        block += 1;
-      }
-    }
-  }
-};
-
-// Special case for non-vectorized types such as float16.
-template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
-          typename Device, typename Scalar, typename Index,
-          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
-          bool inner_dim_reordered, int Alignment, int nr>
-struct gemm_pack_rhs<
-    Scalar, Index,
-    TensorContractionSubMapper<
-        Scalar, Index, Rhs,
-        TensorEvaluator<
-            const TensorReshapingOp<
-                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-            Device>,
-        nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered,
-        Alignment>,
-    nr, ColMajor, false, false> {
-  typedef TensorContractionSubMapper<
-      Scalar, Index, Rhs,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered,
-      Alignment>
-      SubMapper;
-  typedef SubMapper DataMapper;
-
-  EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE)
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
-                                    Index depth, Index cols, Index stride = 0,
-                                    Index offset = 0) const {
-    eigen_assert(stride == 0);
-    eigen_assert(offset == 0);
-
-    const Index packet_cols4 = (cols / 4) * 4;
-
-    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
-      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
-      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
-      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
-
-      if (!rhs.nonStandardPatches()) {
-        for (Index k = 0; k < depth; k++) {
-          block[0] = dm0.loadCoeffStandard(k);
-          block[1] = dm1.loadCoeffStandard(k);
-          block[2] = dm2.loadCoeffStandard(k);
-          block[3] = dm3.loadCoeffStandard(k);
-          block += 4;
-        }
-      } else {
-        for (Index k = 0; k < depth; k++) {
-          block[0] = dm0(k);
-          block[1] = dm1(k);
-          block[2] = dm2(k);
-          block[3] = dm3(k);
-          block += 4;
-        }
-      }
-    }
-
-    // Copy the remaining columns one at a time (nr==1).
-    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
-      for (Index k = 0; k < depth; k++) {
-        *block = dm0(k);
-        block += 1;
-      }
-    }
-  }
-};
-
-#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
 // Pack a block of the right input matrix (in our case it's always a
 // "virtual matrix" constructed from extracted image patches) in contiguous
 // block in column-major storage order. Knowing the properties of the
@@ -1500,204 +221,12 @@
     }
   }
 };
+}  // end namespace internal
+}  // end namespace Eigen
 #endif  // defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
 
-}  // end namespace internal
-
-/** SpatialConvolution
- * \ingroup CXX11_NeuralNetworks_Module
- *
- * \brief Applies a 2D convolution over a multichannel input image.
- *
- * The input parameter is expected to be a tensor with a rank of 3 or more
- * (channels, height, width, and optionally others).
- * The kernel parameter is expected to be a 4D tensor (filters, channels,
- * kernel_height, kernel_width).
- * The input and the kernel must both be in col-major layout. The result will
- * also be in col-major layout.
- *
- * If col_in_stride, row_in_stride > 1, then applies convolution with holes
- * (aka atrous convolution), sampling every col_in_stride, row_in_stride input
- * pixels.
- *
- * The result can be assigned to a tensor of rank equal to the rank of the
- * input. The dimensions of the result will be filters, height, width (and
- * others if applicable).
- *
- * It is possible to swap the order of the width and height dimensions provided
- * that the same order is used in the input, the kernel, and the output.
- *
- * It is also possible to add an output kernel to the contraction; the output
- * kernel is called by Eigen when it "finalizes" a block of the output tensor.
- *
- */
-template <typename Input, typename Kernel,
-          typename OutputKernel = const NoOpOutputKernel>
-EIGEN_DEVICE_FUNC
-    EIGEN_ALWAYS_INLINE static const typename internal::conditional<
-        internal::traits<Input>::Layout == ColMajor,
-        TensorReshapingOp<
-            const DSizes<typename internal::traits<Input>::Index,
-                         internal::traits<Input>::NumDimensions>,
-            const TensorContractionOp<
-                const array<IndexPair<typename internal::traits<Input>::Index>,
-                            1>,
-                const TensorReshapingOp<
-                    const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const Kernel>,
-                const TensorReshapingOp<
-                    const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const TensorImagePatchOp<Dynamic, Dynamic, const Input> >,
-                const OutputKernel> >,
-        TensorReshapingOp<
-            const DSizes<typename internal::traits<Input>::Index,
-                         internal::traits<Input>::NumDimensions>,
-            const TensorContractionOp<
-                const array<IndexPair<typename internal::traits<Input>::Index>,
-                            1>,
-                const TensorReshapingOp<
-                    const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const TensorImagePatchOp<Dynamic, Dynamic, const Input> >,
-                const TensorReshapingOp<
-                    const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const Kernel>,
-                const OutputKernel> > >::type
-    SpatialConvolution(const Input& input, const Kernel& kernel,
-                       const Index row_stride = 1, const Index col_stride = 1,
-                       const PaddingType padding_type = PADDING_SAME,
-                       const Index row_in_stride = 1,
-                       const Index col_in_stride = 1,
-                       const OutputKernel& output_kernel = OutputKernel()) {
-  typedef typename internal::traits<Input>::Index TensorIndex;
-  TensorRef<Tensor<typename internal::traits<Input>::Scalar,
-                   internal::traits<Input>::NumDimensions,
-                   internal::traits<Input>::Layout, TensorIndex> >
-      in(input);
-  TensorRef<Tensor<typename internal::traits<Kernel>::Scalar,
-                   internal::traits<Kernel>::NumDimensions,
-                   internal::traits<Kernel>::Layout, TensorIndex> >
-      kern(kernel);
-
-  EIGEN_STATIC_ASSERT(
-      internal::traits<Input>::Layout == internal::traits<Kernel>::Layout,
-      YOU_MADE_A_PROGRAMMING_MISTAKE)
-  const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
-
-  const int NumDims = internal::traits<Input>::NumDimensions;
-
-  // Number of filters to apply. This is the same as the output depth of the
-  // result.
-  const TensorIndex kernelFilters =
-      isColMajor ? kern.dimensions()[0] : kern.dimensions()[3];
-  // Number of channels. This is the same as the input depth.
-  const TensorIndex kernelChannels =
-      isColMajor ? kern.dimensions()[1] : kern.dimensions()[2];
-  const TensorIndex kernelRows =
-      isColMajor ? kern.dimensions()[2] : kern.dimensions()[1];
-  const TensorIndex kernelCols =
-      isColMajor ? kern.dimensions()[3] : kern.dimensions()[0];
-
-  const Index kernelRowsEff =
-      kernelRows + (kernelRows - 1) * (row_in_stride - 1);
-  const Index kernelColsEff =
-      kernelCols + (kernelCols - 1) * (col_in_stride - 1);
-
-  array<IndexPair<TensorIndex>, 1> contract_dims;
-  contract_dims[0] = IndexPair<TensorIndex>(1, 0);
-
-  const TensorIndex InputRows =
-      isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
-  const TensorIndex InputCols =
-      isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
-
-  TensorIndex out_height;
-  TensorIndex out_width;
-  switch (padding_type) {
-    case PADDING_VALID:
-      out_height = numext::ceil((InputRows - kernelRowsEff + 1.f) /
-                                static_cast<float>(row_stride));
-      out_width = numext::ceil((InputCols - kernelColsEff + 1.f) /
-                               static_cast<float>(col_stride));
-      break;
-    case PADDING_SAME:
-      out_height = numext::ceil(InputRows / static_cast<float>(row_stride));
-      out_width = numext::ceil(InputCols / static_cast<float>(col_stride));
-      break;
-    default:
-      // Initialize unused variables to avoid a compiler warning
-      out_height = 0;
-      out_width = 0;
-      eigen_assert(false && "unexpected padding");
-  }
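
A quick standalone check of the two output-size formulas above, with made-up sizes (7x7 input, effective 3x3 kernel, stride 2); this is an illustrative sketch, not part of the diff:

#include <cmath>
#include <cstdio>

int main() {
  const float input_size = 7.f, kernel_eff = 3.f, stride = 2.f;
  // PADDING_VALID: ceil((in - k_eff + 1) / stride) -> ceil(5 / 2) = 3.
  const int out_valid =
      static_cast<int>(std::ceil((input_size - kernel_eff + 1.f) / stride));
  // PADDING_SAME: ceil(in / stride) -> ceil(7 / 2) = 4.
  const int out_same = static_cast<int>(std::ceil(input_size / stride));
  std::printf("VALID: %d, SAME: %d\n", out_valid, out_same);
  return 0;
}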
-
-  // Molds the output of the patch extraction code into a 2d tensor:
-  // - the first dimension (dims[0]): the patch values to be multiplied with the
-  // kernels
-  // - the second dimension (dims[1]): everything else
-  DSizes<TensorIndex, 2> pre_contract_dims;
-  if (isColMajor) {
-    pre_contract_dims[0] = kernelChannels * kernelRows * kernelCols;
-    pre_contract_dims[1] = out_height * out_width;
-    for (int i = 3; i < NumDims; ++i) {
-      pre_contract_dims[1] *= in.dimension(i);
-    }
-  } else {
-    pre_contract_dims[1] = kernelChannels * kernelRows * kernelCols;
-    pre_contract_dims[0] = out_height * out_width;
-    for (int i = 0; i < NumDims - 3; ++i) {
-      pre_contract_dims[0] *= in.dimension(i);
-    }
-  }
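
For illustration (made-up sizes, ColMajor case): with kernelChannels = 3 and a 5x5 kernel, pre_contract_dims[0] = 3 * 5 * 5 = 75 patch values per patch; with a 32x32 output and a batch of 8, pre_contract_dims[1] = 32 * 32 * 8 = 8192 patches.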
-
-  // Molds the output of the contraction into the shape expected by the user
-  // (assuming this is ColMajor):
-  // - 1st dim: kernel filters
-  // - 2nd dim: output height
-  // - 3rd dim: output width
-  // - 4th dim and beyond: everything else including batch size
-  DSizes<TensorIndex, NumDims> post_contract_dims;
-  if (isColMajor) {
-    post_contract_dims[0] = kernelFilters;
-    post_contract_dims[1] = out_height;
-    post_contract_dims[2] = out_width;
-    for (int i = 3; i < NumDims; ++i) {
-      post_contract_dims[i] = in.dimension(i);
-    }
-  } else {
-    post_contract_dims[NumDims - 1] = kernelFilters;
-    post_contract_dims[NumDims - 2] = out_height;
-    post_contract_dims[NumDims - 3] = out_width;
-    for (int i = 0; i < NumDims - 3; ++i) {
-      post_contract_dims[i] = in.dimension(i);
-    }
-  }
-
-  DSizes<TensorIndex, 2> kernel_dims;
-  if (isColMajor) {
-    kernel_dims[0] = kernelFilters;
-    kernel_dims[1] = kernelChannels * kernelRows * kernelCols;
-  } else {
-    kernel_dims[0] = kernelChannels * kernelRows * kernelCols;
-    kernel_dims[1] = kernelFilters;
-  }
-  return choose(
-      Cond<internal::traits<Input>::Layout == ColMajor>(),
-      kernel.reshape(kernel_dims)
-          .contract(input
-                        .extract_image_patches(
-                            kernelRows, kernelCols, row_stride, col_stride,
-                            row_in_stride, col_in_stride, padding_type)
-                        .reshape(pre_contract_dims),
-                    contract_dims, output_kernel)
-          .reshape(post_contract_dims),
-      input
-          .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
-                                 row_in_stride, col_in_stride, padding_type)
-          .reshape(pre_contract_dims)
-          .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
-          .reshape(post_contract_dims));
-}
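
For reference, a minimal, hypothetical usage sketch of the routine this hunk relocates; the sizes are made up, and it relies on the default arguments visible above (strides of 1, PADDING_SAME):

#include "tensorflow/core/kernels/eigen_spatial_convolutions.h"

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // ColMajor layout: (channels, rows, cols, batch) input and
  // (filters, channels, kernel_rows, kernel_cols) kernel.
  Eigen::Tensor<float, 4> input(3, 32, 32, 8);
  Eigen::Tensor<float, 4> kernel(16, 3, 5, 5);
  input.setRandom();
  kernel.setRandom();

  // Default strides of 1 and PADDING_SAME preserve the spatial dimensions,
  // so the result has dimensions (16, 32, 32, 8).
  Eigen::Tensor<float, 4> output = Eigen::SpatialConvolution(input, kernel);
  return 0;
}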
-
-}  // end namespace Eigen
+// Note: the following header is used in both TF and TFLite. In particular, it
+// is used for float TFLite Conv2D.
+#include "tensorflow/core/kernels/eigen_spatial_convolutions-inl.h"
 
 #endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
diff --git a/tensorflow/core/kernels/host_constant_op.cc b/tensorflow/core/kernels/host_constant_op.cc
index d08a7c9..17dad52 100644
--- a/tensorflow/core/kernels/host_constant_op.cc
+++ b/tensorflow/core/kernels/host_constant_op.cc
@@ -63,8 +63,6 @@
 #endif  // TENSORFLOW_USE_SYCL
 
 // HostConst: forced to generate output on the host.
-// Only used in tests; no op is registered for this kernel
-// externally (i.e., in array_ops.cc)
 REGISTER_KERNEL_BUILDER(Name("HostConst").Device(DEVICE_CPU), _HostConstantOp);
 REGISTER_KERNEL_BUILDER(
     Name("HostConst").Device(DEVICE_GPU).HostMemory("output"), _HostConstantOp);
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 26e4212..fcca2f7 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -914,13 +914,6 @@
                     "Pooling is not yet supported on the batch dimension."));
     OP_REQUIRES_OK(context, context->GetAttr("include_batch_in_index",
                                              &include_batch_in_index_));
-    if (context->device_type() == DeviceType(DEVICE_GPU)) {
-      OP_REQUIRES(context, include_batch_in_index_ == false,
-                  errors::Unimplemented(
-                      "include_batch_in_index=true is not yet supported "
-                      "on the GPU kernel."));
-    }
-
     TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                    &propagate_nans_));
   }
@@ -1313,7 +1306,7 @@
         params.out_width, params.window_rows, params.window_cols,
         params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
         output->flat<T>().data(), nullptr, context->eigen_gpu_device(),
-        propagate_nans);
+        propagate_nans, false);
     if (!status) {
       context->SetStatus(
           errors::Internal("Failed launching MaxPoolForwardNoMask"));
@@ -1326,10 +1319,6 @@
   static void launch(OpKernelContext* context, const PoolParameters& params,
                      const Tensor& input, Tensor* output, Tensor* argmax,
                      bool propagate_nans, bool include_batch_in_index) {
-    OP_REQUIRES(context, include_batch_in_index == false,
-                errors::Unimplemented(
-                    "include_batch_in_index=true is not yet supported "
-                    "on the GPU kernel."));
     bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
         input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
         params.tensor_in_cols, params.depth, params.out_height,
@@ -1337,7 +1326,7 @@
         params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
         output->flat<T>().data(),
         reinterpret_cast<int64*>(argmax->flat<int64>().data()),
-        context->eigen_gpu_device(), propagate_nans);
+        context->eigen_gpu_device(), propagate_nans, include_batch_in_index);
     if (!status) {
       context->SetStatus(
           errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
@@ -1350,10 +1339,6 @@
   static void launch(OpKernelContext* context, const PoolParameters& params,
                      const Tensor& grad_in, const Tensor& argmax,
                      Tensor* grad_out, const bool include_batch_in_index) {
-    OP_REQUIRES(context, include_batch_in_index == false,
-                errors::Unimplemented(
-                    "include_batch_in_index=true is not yet supported "
-                    "on the GPU kernel."));
     const int input_size = params.tensor_in_batch * params.tensor_in_rows *
                            params.tensor_in_cols * params.depth;
     const int output_size = params.tensor_in_batch * params.out_height *
@@ -1364,7 +1349,8 @@
     bool status = functor::MaxPoolBackwardWithArgmax<T>()(
         output_size, input_size, grad_in.flat<T>().data(),
         reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
-        bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device());
+        bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device(),
+        include_batch_in_index);
     if (!status) {
       context->SetStatus(
           errors::Internal("Failed launching MaxPoolBackwardWithArgmax"));
@@ -1377,10 +1363,6 @@
   static void launch(OpKernelContext* context, const PoolParameters& params,
                      const Tensor& grad_in, const Tensor& argmax,
                      Tensor* grad_out, const bool include_batch_in_index) {
-    OP_REQUIRES(context, include_batch_in_index == false,
-                errors::Unimplemented(
-                    "include_batch_in_index=true is not yet supported "
-                    "on the GPU kernel."));
     const int input_size = params.tensor_in_batch * params.tensor_in_rows *
                            params.tensor_in_cols * params.depth;
     const int output_size = params.tensor_in_batch * params.out_height *
@@ -1392,7 +1374,8 @@
     bool status = functor::MaxPoolGradBackwardWithArgmax<T>()(
         output_size, input_size, grad_in.flat<T>().data(),
         reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
-        bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device());
+        bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device(),
+        include_batch_in_index);
     if (!status) {
       context->SetStatus(
           errors::Internal("Failed launching MaxPoolGradBackwardWithArgmax"));
@@ -1473,32 +1456,32 @@
 // default Eigen implementation so we are using the custom kernel as the
 // default. However, you can explicitly invoke the eigen version using
 // kernel_label_map.
-#define REGISTER_GPU_ONLY_POOL_KERNELS(T)                            \
-  REGISTER_KERNEL_BUILDER(Name("MaxPool")                            \
-                              .Device(DEVICE_GPU)                    \
-                              .TypeConstraint<T>("T")                \
-                              .Label("eigen_tensor"),                \
-                          MaxPoolingOp<GPUDevice, T>);               \
-  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                          \
-                              .Device(DEVICE_GPU)                    \
-                              .HostMemory("ksize")                   \
-                              .HostMemory("strides")                 \
-                              .TypeConstraint<T>("T")                \
-                              .Label("eigen_tensor"),                \
-                          MaxPoolingV2Op<GPUDevice, T>);             \
-  REGISTER_KERNEL_BUILDER(                                           \
-      Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"),     \
-      MaxPoolingNoMaskOp<GPUDevice, T>);                             \
-  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                          \
-                              .Device(DEVICE_GPU)                    \
-                              .HostMemory("ksize")                   \
-                              .HostMemory("strides")                 \
-                              .TypeConstraint<T>("T"),               \
-                          MaxPoolingNoMaskV2Op<GPUDevice, T>);       \
-  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax")          \
-                              .Device(DEVICE_GPU)                    \
-                              .TypeConstraint<T>("T")                \
-                              .TypeConstraint<int64>("Targmax"),     \
+#define REGISTER_GPU_ONLY_POOL_KERNELS(T)                        \
+  REGISTER_KERNEL_BUILDER(Name("MaxPool")                        \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<T>("T")            \
+                              .Label("eigen_tensor"),            \
+                          MaxPoolingOp<GPUDevice, T>);           \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                      \
+                              .Device(DEVICE_GPU)                \
+                              .HostMemory("ksize")               \
+                              .HostMemory("strides")             \
+                              .TypeConstraint<T>("T")            \
+                              .Label("eigen_tensor"),            \
+                          MaxPoolingV2Op<GPUDevice, T>);         \
+  REGISTER_KERNEL_BUILDER(                                       \
+      Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      MaxPoolingNoMaskOp<GPUDevice, T>);                         \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                      \
+                              .Device(DEVICE_GPU)                \
+                              .HostMemory("ksize")               \
+                              .HostMemory("strides")             \
+                              .TypeConstraint<T>("T"),           \
+                          MaxPoolingNoMaskV2Op<GPUDevice, T>);   \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax")      \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<T>("T")            \
+                              .TypeConstraint<int64>("Targmax"), \
                           MaxPoolingGradGradWithArgmaxOp<GPUDevice, T>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_ONLY_POOL_KERNELS);
 
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index f28811f..1309ce7 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -54,6 +54,8 @@
 //         int form, keeping track of the flattened index of the input item that
 //         produces the max output. If a nullptr is passed in for mask, no mask
 //         will be produced.
+//     include_batch_in_index: whether to include the batch dimension in the
+//         flattened index of `argmax`.
 //
 // To call the forward and backward functions, use e.g.:
 // const int kThreadsPerBlock = 1024
@@ -61,14 +63,12 @@
 // MaxPoolForwardNCHW<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
 //                      kThreadsPerBlock, 0, cuda_stream>>>(...);
 template <bool propagate_nans, typename dtype>
-__global__ void MaxPoolForwardNCHW(const int nthreads, const dtype* bottom_data,
-                                   const int channels, const int height,
-                                   const int width, const int pooled_height,
-                                   const int pooled_width, const int kernel_h,
-                                   const int kernel_w, const int stride_h,
-                                   const int stride_w, const int pad_t,
-                                   const int pad_l, dtype* top_data,
-                                   int64* mask) {
+__global__ void MaxPoolForwardNCHW(
+    const int nthreads, const dtype* bottom_data, const int channels,
+    const int height, const int width, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+    dtype* top_data, int64* mask, const bool include_batch_in_index) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     int pw = index % pooled_width;
     int ph = (index / pooled_width) % pooled_height;
@@ -82,12 +82,13 @@
     wstart = max(wstart, 0);
     dtype maxval = Eigen::NumTraits<dtype>::lowest();
     int maxidx = -1;
-    const dtype* bottom_data_n = bottom_data + n * channels * height * width;
+    const int offset = n * channels * height * width;
+    const dtype* bottom_data_n = bottom_data + offset;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
         int idx = c * height * width + h * width + w;
         if (IsGreaterThan<propagate_nans>(bottom_data_n[idx], maxval)) {
-          maxidx = idx;
+          maxidx = include_batch_in_index ? idx + offset : idx;
           maxval = bottom_data_n[idx];
         }
       }
@@ -136,14 +137,12 @@
 }
 
 template <bool propagate_nans, typename dtype>
-__global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data,
-                                   const int height, const int width,
-                                   const int channels, const int pooled_height,
-                                   const int pooled_width, const int kernel_h,
-                                   const int kernel_w, const int stride_h,
-                                   const int stride_w, const int pad_t,
-                                   const int pad_l, dtype* top_data,
-                                   int64* mask) {
+__global__ void MaxPoolForwardNHWC(
+    const int nthreads, const dtype* bottom_data, const int height,
+    const int width, const int channels, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+    dtype* top_data, int64* mask, const bool include_batch_in_index) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     int n = index;
     int c = n % channels;
@@ -158,12 +157,13 @@
     wstart = max(wstart, 0);
     dtype maxval = Eigen::NumTraits<dtype>::lowest();
     int maxidx = -1;
-    const dtype* bottom_data_n = bottom_data + n * height * width * channels;
+    const int offset = n * height * width * channels;
+    const dtype* bottom_data_n = bottom_data + offset;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
         int idx = (h * width + w) * channels + c;
         if (IsGreaterThan<propagate_nans>(bottom_data_n[idx], maxval)) {
-          maxidx = idx;
+          maxidx = include_batch_in_index ? idx + offset : idx;
           maxval = bottom_data_n[idx];
         }
       }
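
For orientation, here is a minimal pure-Python sketch (not part of the patch; the shapes and names are illustrative) of how the NHWC forward kernel above derives the flattened `argmax` index with and without `include_batch_in_index`:

```python
def argmax_index_nhwc(n, h, w, c, height, width, channels,
                      include_batch_in_index):
    # Mirrors the kernel: `idx` is the flattened index within one image,
    # `offset` is where image `n` starts in the flattened NHWC input.
    idx = (h * width + w) * channels + c
    offset = n * height * width * channels
    return idx + offset if include_batch_in_index else idx

# Image n=1 of a 2x4x4x3 input, max found at h=2, w=1, c=0:
print(argmax_index_nhwc(1, 2, 1, 0, 4, 4, 3, False))  # 27 (per-image index)
print(argmax_index_nhwc(1, 2, 1, 0, 4, 4, 3, True))   # 75 = 27 + 1*4*4*3
```
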
@@ -231,17 +231,20 @@
 //     bottom_offset: the pre-computed per-image offset of the maxpool input.
 //         This is equal to H*W*C.
 //     bottom_diff: the gradient with respect to the input.
+//     include_batch_in_index: whether to include the batch dimension in the
+//         flattened index of `argmax`.
 // This function relies on CudaAtomicAdd to avoid race conditions. Also, before
 // the kernel is run, you will need to make sure that bottom_diff is filled with
 // zero first.
 template <typename dtype>
 __global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
                                 const int64* mask, const int top_offset,
-                                const int bottom_offset, dtype* bottom_diff) {
+                                const int bottom_offset, dtype* bottom_diff,
+                                const bool include_batch_in_index) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    int image_id = (index / top_offset);
-    CudaAtomicAdd(bottom_diff + image_id * bottom_offset + mask[index],
-                  top_diff[index]);
+    const int offset =
+        include_batch_in_index ? 0 : (index / top_offset) * bottom_offset;
+    CudaAtomicAdd(bottom_diff + offset + mask[index], top_diff[index]);
   }
 }
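
The gradient kernel above undoes that choice when scattering: below is a minimal sketch (illustrative only, plain Python lists standing in for device buffers) of the `MaxPoolBackward` accumulation.

```python
def maxpool_backward(top_diff, mask, top_offset, bottom_offset,
                     include_batch_in_index):
    # bottom_diff must start zeroed, matching the SetZero launch in the CUDA path.
    num_images = len(top_diff) // top_offset
    bottom_diff = [0.0] * (num_images * bottom_offset)
    for index, grad in enumerate(top_diff):
        # When the batch is already encoded in mask, no per-image offset is added.
        offset = (0 if include_batch_in_index
                  else (index // top_offset) * bottom_offset)
        bottom_diff[offset + mask[index]] += grad  # CudaAtomicAdd equivalent
    return bottom_diff
```
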
 
@@ -358,14 +361,17 @@
 //     bottom_offset: the pre-computed per-image offset of the maxpool output.
 //         This is equal to Hout*Wout*C.
 //     bottom_diff: the gradient of the gradient w.r.t. output.
+//     include_batch_in_index: whether to include the batch dimension in the
+//         flattened index of `argmax`.
 template <typename dtype>
 __global__ void MaxPoolGradBackward(const int nthreads, const dtype* top_diff,
                                     const int64* mask, const int top_offset,
-                                    const int bottom_offset,
-                                    dtype* bottom_diff) {
+                                    const int bottom_offset, dtype* bottom_diff,
+                                    const bool include_batch_in_index) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    int image_id = (index / bottom_offset);
-    bottom_diff[index] = top_diff[image_id * top_offset + mask[index]];
+    const int offset =
+        include_batch_in_index ? 0 : (index / bottom_offset) * top_offset;
+    bottom_diff[index] = top_diff[offset + mask[index]];
   }
 }
 
@@ -399,7 +405,8 @@
     const int channels, const int pooled_height, const int pooled_width,
     const int kernel_h, const int kernel_w, const int stride_h,
     const int stride_w, const int pad_t, const int pad_l, T* top_data,
-    int64* mask, const Eigen::GpuDevice& d, bool propagate_nans) {
+    int64* mask, const Eigen::GpuDevice& d, bool propagate_nans,
+    const bool include_batch_in_index) {
   const int kThreadsPerBlock = 1024;
   const int output_size = batch * channels * pooled_height * pooled_width;
   if (output_size == 0) return true;
@@ -409,14 +416,14 @@
            kThreadsPerBlock, 0, d.stream()>>>(
             output_size, bottom_data, height, width, channels, pooled_height,
             pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
-            top_data, mask);
+            top_data, mask, include_batch_in_index);
   } else {
     MaxPoolForwardNHWC<false>
         <<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
            kThreadsPerBlock, 0, d.stream()>>>(
             output_size, bottom_data, height, width, channels, pooled_height,
             pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
-            top_data, mask);
+            top_data, mask, include_batch_in_index);
   }
   return d.ok();
 }
@@ -449,14 +456,16 @@
 bool MaxPoolBackwardWithArgmax<T>::operator()(
     const int output_size, const int input_size, const T* top_diff,
     const int64* mask, const int top_offset, const int bottom_offset,
-    T* bottom_diff, const Eigen::GpuDevice& d) {
+    T* bottom_diff, const Eigen::GpuDevice& d,
+    const bool include_batch_in_index) {
   const int kThreadsPerBlock = 1024;
   if (input_size == 0) return true;
   SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
             kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
   MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
                     kThreadsPerBlock, 0, d.stream()>>>(
-      output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
+      output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff,
+      include_batch_in_index);
   return d.ok();
 }
 
@@ -492,12 +501,14 @@
 bool MaxPoolGradBackwardWithArgmax<T>::operator()(
     const int output_size, const int input_size, const T* top_diff,
     const int64* mask, const int top_offset, const int bottom_offset,
-    T* bottom_diff, const Eigen::GpuDevice& d) {
+    T* bottom_diff, const Eigen::GpuDevice& d,
+    const bool include_batch_in_index) {
   if (input_size == 0) return true;
   CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d);
   MaxPoolGradBackward<<<config.block_count, config.thread_per_block, 0,
                         d.stream()>>>(output_size, top_diff, mask, top_offset,
-                                      bottom_offset, bottom_diff);
+                                      bottom_offset, bottom_diff,
+                                      include_batch_in_index);
   return d.ok();
 }
 
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.h b/tensorflow/core/kernels/maxpooling_op_gpu.h
index 38ebb34..c18c489 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.h
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.h
@@ -39,7 +39,8 @@
                   const int pooled_width, const int kernel_h,
                   const int kernel_w, const int stride_h, const int stride_w,
                   const int pad_t, const int pad_l, T* top_data, int64* mask,
-                  const Eigen::GpuDevice& d, bool propagate_nans);
+                  const Eigen::GpuDevice& d, bool propagate_nans,
+                  const bool include_batch_in_index);
 };
 
 struct MaxPoolForwardNoMask_NCHW_VECT_C {
@@ -56,7 +57,7 @@
   bool operator()(const int output_size, const int input_size,
                   const T* top_diff, const int64* mask, const int top_offset,
                   const int bottom_offset, T* bottom_diff,
-                  const Eigen::GpuDevice& d);
+                  const Eigen::GpuDevice& d, const bool include_batch_in_index);
 };
 
 template <typename T>
@@ -74,7 +75,7 @@
   bool operator()(const int output_size, const int input_size,
                   const T* top_diff, const int64* mask, const int top_offset,
                   const int bottom_offset, T* bottom_diff,
-                  const Eigen::GpuDevice& d);
+                  const Eigen::GpuDevice& d, const bool include_batch_in_index);
 };
 
 template <typename T>
diff --git a/tensorflow/core/kernels/stateful_random_ops.cc b/tensorflow/core/kernels/stateful_random_ops.cc
index b664bf1..1312593 100644
--- a/tensorflow/core/kernels/stateful_random_ops.cc
+++ b/tensorflow/core/kernels/stateful_random_ops.cc
@@ -18,6 +18,7 @@
 #include "tensorflow/core/kernels/random_op.h"
 #include "tensorflow/core/kernels/stateful_random_ops_cpu_gpu.h"
 #include "tensorflow/core/kernels/training_op_helpers.h"
+#include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
 
@@ -142,6 +143,44 @@
   }
 };
 
+template <typename T>
+class NonDeterministicIntsOp : public OpKernel {
+ public:
+  explicit NonDeterministicIntsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& shape_t = ctx->input(0);
+    TensorShape shape;
+    OP_REQUIRES_OK(ctx, ctx->op_kernel().MakeShape(shape_t, &shape));
+    Tensor* output;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, shape, &output));
+    if (shape.num_elements() == 0) return;
+
+    switch (dtype_) {
+      case DT_INT32:
+      case DT_UINT32:
+      case DT_INT64:
+      case DT_UINT64: {
+        auto output_flat = output->flat<T>();
+        auto data = output_flat.data();
+        for (int64 i = 0; i < output_flat.size(); ++i) {
+          data[i] = static_cast<T>(random::New64());
+        }
+        break;
+      }
+      default:
+        OP_REQUIRES(ctx, false,
+                    errors::InvalidArgument("Unsupported dtype: ",
+                                            DataTypeString(dtype_)));
+    }
+  }
+
+ private:
+  DataType dtype_;
+};
+
 // So far the 'Distribution' type parameter is only used when the algorithm is
 // philox, so 'NormalDistribution<PhiloxRandom, ...>' is fine for now.
 #define REGISTER(DEVICE, TYPE)            \
@@ -186,7 +225,20 @@
 #undef REGISTER_CPU
 #undef REGISTER
 
+#define REGISTER_NonDeterministicInts(TYPE)                   \
+  REGISTER_KERNEL_BUILDER(Name("NonDeterministicInts")        \
+                              .Device(DEVICE_CPU)             \
+                              .HostMemory("shape")            \
+                              .TypeConstraint<TYPE>("dtype"), \
+                          NonDeterministicIntsOp<TYPE>);
+
+TF_CALL_int32(REGISTER_NonDeterministicInts);
+TF_CALL_uint32(REGISTER_NonDeterministicInts);
+TF_CALL_int64(REGISTER_NonDeterministicInts);
+TF_CALL_uint64(REGISTER_NonDeterministicInts);
+
+#undef REGISTER_NonDeterministicInts
+
 // TODO(wangpeng): Add RNG ops for other distributions.
-// TODO(wangpeng): Add support for XLA.
 
 }  // end namespace tensorflow
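
As a rough analogue (not the kernel itself; `secrets` and `numpy` stand in for `random::New64()`), the new `NonDeterministicIntsOp` fills a tensor of the requested shape with fresh 64-bit values on every call, so repeated executions give different results:

```python
import secrets

import numpy as np

def non_deterministic_ints(shape, dtype=np.int64):
    # Each element gets a fresh 64-bit value, then is cast to the output dtype,
    # mirroring `data[i] = static_cast<T>(random::New64())` in the kernel.
    n = int(np.prod(shape, dtype=np.int64))
    flat = [secrets.randbits(64) for _ in range(n)]
    return np.array(flat, dtype=np.uint64).astype(dtype).reshape(shape)

print(non_deterministic_ints([2, 3]))          # differs on every run
print(non_deterministic_ints([4], np.uint32))
```
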
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 5ff462c..fc06f26 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -41749,6 +41749,32 @@
   name: "NoOp"
 }
 op {
+  name: "NonDeterministicInts"
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  is_stateful: true
+}
+op {
   name: "NonMaxSuppression"
   input_arg {
     name: "boxes"
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 5f556a5..7936c4e 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -20734,6 +20734,32 @@
   name: "NoOp"
 }
 op {
+  name: "NonDeterministicInts"
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  is_stateful: true
+}
+op {
   name: "NonMaxSuppression"
   input_arg {
     name: "boxes"
diff --git a/tensorflow/core/ops/stateful_random_ops.cc b/tensorflow/core/ops/stateful_random_ops.cc
index cf35eb7..643b3e9 100644
--- a/tensorflow/core/ops/stateful_random_ops.cc
+++ b/tensorflow/core/ops/stateful_random_ops.cc
@@ -66,7 +66,21 @@
       return Status::OK();
     });
 
-// Register the old 'StatefulStandardNormal' op. This op is a short-lived
+REGISTER_OP("NonDeterministicInts")
+    .Input("shape: shape_dtype")
+    .SetIsStateful()
+    .Output("output: dtype")
+    .Attr("dtype : type = DT_INT64")
+    .Attr("shape_dtype : type = DT_INT64")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      using shape_inference::ShapeHandle;
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    });
+
+// Register the deprecated 'StatefulStandardNormal' op. This op is a short-lived
 // version where the 'resource' variable also contains the algorithm tag.
 // It is deprecated in favor of 'StatefulStandardNormalV2'.
 REGISTER_OP("StatefulStandardNormal")
diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl
index ab05b25..4e7a35b 100644
--- a/tensorflow/core/platform/default/build_config_root.bzl
+++ b/tensorflow/core/platform/default/build_config_root.bzl
@@ -4,9 +4,13 @@
 
 load("@local_config_remote_execution//:remote_execution.bzl", "gpu_test_tags")
 
-def tf_cuda_tests_tags():
+def tf_gpu_tests_tags():
     return ["requires-gpu", "gpu"] + gpu_test_tags()
 
+# Terminology change: keep tf_cuda_* as an alias for backwards compatibility.
+def tf_cuda_tests_tags():
+    return tf_gpu_tests_tags()
+
 def tf_sycl_tests_tags():
     return ["requires-gpu", "gpu"] + gpu_test_tags()
 
diff --git a/tensorflow/examples/saved_model/integration_tests/BUILD b/tensorflow/examples/saved_model/integration_tests/BUILD
index 0841593..dc0126a 100644
--- a/tensorflow/examples/saved_model/integration_tests/BUILD
+++ b/tensorflow/examples/saved_model/integration_tests/BUILD
@@ -92,17 +92,17 @@
     ],
 )
 
+# NOTE: Split SavedModelTest due to Forge input size limit.
+
 py_test(
-    name = "saved_model_test",
+    name = "saved_model_part1_test",
     srcs = [
-        "saved_model_test.py",
+        "saved_model_part1_test.py",
     ],
     data = [
-        ":export_mnist_cnn",
         ":export_rnn_cell",
         ":export_simple_text_embedding",
         ":export_text_rnn_model",
-        ":use_mnist_cnn",
         ":use_model_in_sequential_keras",
         ":use_rnn_cell",
         ":use_text_rnn_model",
@@ -110,6 +110,28 @@
     shard_count = 4,
     srcs_version = "PY2AND3",
     tags = [
+        "no_cuda_on_cpu_tap",  # forge input size exceeded
+        "noasan",  # forge input size exceeded
+        "nomsan",  # forge input size exceeded
+        "notsan",  # forge input size exceeded
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_test(
+    name = "saved_model_part2_test",
+    srcs = [
+        "saved_model_part2_test.py",
+    ],
+    data = [
+        ":export_mnist_cnn",
+        ":use_mnist_cnn",
+    ],
+    shard_count = 4,
+    srcs_version = "PY2AND3",
+    tags = [
         "noasan",  # forge input size exceeded
         "nomsan",  # forge input size exceeded
         "notsan",  # forge input size exceeded
diff --git a/tensorflow/examples/saved_model/integration_tests/saved_model_test.py b/tensorflow/examples/saved_model/integration_tests/saved_model_part1_test.py
similarity index 83%
rename from tensorflow/examples/saved_model/integration_tests/saved_model_test.py
rename to tensorflow/examples/saved_model/integration_tests/saved_model_part1_test.py
index 6ec387e..94f1444 100644
--- a/tensorflow/examples/saved_model/integration_tests/saved_model_test.py
+++ b/tensorflow/examples/saved_model/integration_tests/saved_model_part1_test.py
@@ -28,7 +28,7 @@
 from tensorflow.python.platform import tf_logging as logging
 
 
-class SavedModelTest(tf.test.TestCase):
+class SavedModelPart1Test(tf.test.TestCase):
 
   def assertCommandSucceeded(self, binary, **flags):
     command_parts = [binary]
@@ -70,17 +70,6 @@
         "use_model_in_sequential_keras")
     self.assertCommandSucceeded(use_binary, model_dir=export_dir)
 
-  @test_util.run_v2_only
-  def test_mnist_cnn(self):
-    export_dir = self.get_temp_dir()
-    export_binary = resource_loader.get_path_to_datafile("export_mnist_cnn")
-    self.assertCommandSucceeded(export_binary, export_dir=export_dir,
-                                fast_test_mode="true")
-
-    use_binary = resource_loader.get_path_to_datafile("use_mnist_cnn")
-    self.assertCommandSucceeded(use_binary, export_dir=export_dir,
-                                fast_test_mode="true")
-
 if __name__ == "__main__":
   tf.enable_v2_behavior()
   tf.test.main()
diff --git a/tensorflow/examples/saved_model/integration_tests/saved_model_part2_test.py b/tensorflow/examples/saved_model/integration_tests/saved_model_part2_test.py
new file mode 100644
index 0000000..e357755
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/saved_model_part2_test.py
@@ -0,0 +1,56 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""SavedModel integration test for MNIST."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import subprocess
+
+import tensorflow.compat.v2 as tf
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import tf_logging as logging
+
+
+class SavedModelPart2Test(tf.test.TestCase):
+
+  def assertCommandSucceeded(self, binary, **flags):
+    command_parts = [binary]
+    for flag_key, flag_value in flags.items():
+      command_parts.append("--%s=%s" % (flag_key, flag_value))
+
+    logging.info("Running: %s", command_parts)
+    subprocess.check_call(
+        command_parts, env=dict(os.environ, TF2_BEHAVIOR="enabled"))
+
+  @test_util.run_v2_only
+  def test_mnist_cnn(self):
+    export_dir = self.get_temp_dir()
+    export_binary = resource_loader.get_path_to_datafile("export_mnist_cnn")
+    self.assertCommandSucceeded(
+        export_binary, export_dir=export_dir, fast_test_mode="true")
+
+    use_binary = resource_loader.get_path_to_datafile("use_mnist_cnn")
+    self.assertCommandSucceeded(
+        use_binary, export_dir=export_dir, fast_test_mode="true")
+
+
+if __name__ == "__main__":
+  tf.enable_v2_behavior()
+  tf.test.main()
diff --git a/tensorflow/examples/speech_commands/models.py b/tensorflow/examples/speech_commands/models.py
index 3b96a22..d368fec 100644
--- a/tensorflow/examples/speech_commands/models.py
+++ b/tensorflow/examples/speech_commands/models.py
@@ -530,6 +530,10 @@
       shape=[num_filters, batch, input_time_size],
       trainable=False,
       name='runtime-memory')
+  first_time_flag = tf.get_variable(
+      name="first_time_flag",
+      dtype=tf.int32,
+      initializer=1)
   # Determine the number of new frames in the input, such that we only operate
   # on those. For training we do not use the memory, and thus use all frames
   # provided in the input.
@@ -540,9 +544,10 @@
     window_stride_ms = int(model_settings['window_stride_samples'] * 1000 /
                            model_settings['sample_rate'])
     num_new_frames = tf.cond(
-        tf.equal(tf.count_nonzero(memory), 0),
+        tf.equal(first_time_flag, 1),
         lambda: input_time_size,
         lambda: int(runtime_settings['clip_stride_ms'] / window_stride_ms))
+  first_time_flag = 0
   new_fingerprint_input = fingerprint_input[
       :, -num_new_frames*input_frequency_size:]
   # Expand to add input channels dimension.
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 11b732c..aa48b0c 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -7680,6 +7680,80 @@
 	return scope.AddOperation(opspec)
 }
 
+// Gather slices from `params` axis `axis` according to `indices`.
+//
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
+// params.shape[axis + 1:]` where:
+//
+// ```python
+//     # Scalar indices (output is rank(params) - 1).
+//     output[a_0, ..., a_n, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
+//
+//     # Vector indices (output is rank(params)).
+//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
+//
+//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
+//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
+//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
+// ```
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, a 0 is stored in the
+// corresponding output value.
+//
+// See also `tf.batch_gather` and `tf.gather_nd`.
+//
+// Arguments:
+//	params: The tensor from which to gather values. Must be at least rank
+// `axis + 1`.
+//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
+//	axis: The axis in `params` to gather `indices` from. Defaults to the first
+// dimension. Supports negative indexes.
+//
+// Returns Values from `params` gathered from indices given by `indices`, with
+// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
+func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GatherV2",
+		Input: []tf.Input{
+			params, indices, axis,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Converts the given `resource_handle` representing an iterator to a variant tensor.
+//
+// Arguments:
+//	resource_handle: A handle to an iterator resource.
+//
+// Returns A variant tensor storing the state of the iterator contained in the
+// resource.
+func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SerializeIterator",
+		Input: []tf.Input{
+			resource_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Outputs a tensor containing the reduction across all input tensors.
 //
 // Outputs a tensor containing the reduction across all input tensors passed to ops
@@ -13378,24 +13452,6 @@
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Elementwise computes the bitwise AND of `x` and `y`.
-//
-// The result will have those bits set, that are set in both `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseAnd",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Deserialize and concatenate `SparseTensors` from a serialized minibatch.
 //
 // The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
@@ -24314,6 +24370,46 @@
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// NonDeterministicIntsAttr is an optional argument to NonDeterministicInts.
+type NonDeterministicIntsAttr func(optionalAttr)
+
+// NonDeterministicIntsDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_INT64
+func NonDeterministicIntsDtype(value tf.DataType) NonDeterministicIntsAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Non-deterministically generates some integers.
+//
+// This op may use some OS-provided source of non-determinism (e.g. an RNG), so each execution will give different results.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//
+// Returns Non-deterministic integer values with specified shape.
+func NonDeterministicInts(scope *Scope, shape tf.Output, optional ...NonDeterministicIntsAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "NonDeterministicInts",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // ResourceSparseApplyKerasMomentumAttr is an optional argument to ResourceSparseApplyKerasMomentum.
 type ResourceSparseApplyKerasMomentumAttr func(optionalAttr)
 
@@ -29189,6 +29285,24 @@
 	return op.Output(0), op.Output(1)
 }
 
+// Elementwise computes the bitwise AND of `x` and `y`.
+//
+// The result will have those bits set, that are set in both `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseAnd",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // ResizeAreaAttr is an optional argument to ResizeArea.
 type ResizeAreaAttr func(optionalAttr)
 
@@ -39065,77 +39179,3 @@
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
-
-// Gather slices from `params` axis `axis` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
-// params.shape[axis + 1:]` where:
-//
-// ```python
-//     # Scalar indices (output is rank(params) - 1).
-//     output[a_0, ..., a_n, b_0, ..., b_n] =
-//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
-//
-//     # Vector indices (output is rank(params)).
-//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
-//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
-//
-//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
-//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
-//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
-// ```
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-// </div>
-//
-// Note that on CPU, if an out of bound index is found, an error is returned.
-// On GPU, if an out of bound index is found, a 0 is stored in the
-// corresponding output value.
-//
-// See also `tf.batch_gather` and `tf.gather_nd`.
-//
-// Arguments:
-//	params: The tensor from which to gather values. Must be at least rank
-// `axis + 1`.
-//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
-//	axis: The axis in `params` to gather `indices` from. Defaults to the first
-// dimension. Supports negative indexes.
-//
-// Returns Values from `params` gathered from indices given by `indices`, with
-// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
-func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "GatherV2",
-		Input: []tf.Input{
-			params, indices, axis,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Converts the given `resource_handle` representing an iterator to a variant tensor.
-//
-// Arguments:
-//	resource_handle: A handle to an iterator resource.
-//
-// Returns A variant tensor storing the state of the iterator contained in the
-// resource.
-func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SerializeIterator",
-		Input: []tf.Input{
-			resource_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
index 84197cb..efbb7d7 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
@@ -1286,7 +1286,18 @@
             !nnapi->nnapi_exists) {
           return kTfLiteOk;
         }
-
+        // For NNAPI 1.2+, check whether any accelerator is available. If not,
+        // don't delegate, to avoid NNAPI's CPU reference implementation.
+        if (nnapi->android_sdk_version >= kMinSdkVersionForNNAPI12) {
+          uint32_t device_count = 0;
+          RETURN_TFLITE_ERROR_IF_NN_ERROR(
+              context, nnapi->ANeuralNetworks_getDeviceCount(&device_count));
+          // Any available accelerator will make the device_count larger than 1.
+          // A more sophisticated check and whitelisting can be added later.
+          if (device_count <= 1) {
+            return kTfLiteOk;
+          }
+        }
         // Allocate one element in vector already since TensorFlow Lite uses
         // the first value as the number of nodes. The actual value will be set
         // later, after the vector has been filled.
diff --git a/tensorflow/lite/g3doc/_book.yaml b/tensorflow/lite/g3doc/_book.yaml
index 8557bdd..82e9de3 100644
--- a/tensorflow/lite/g3doc/_book.yaml
+++ b/tensorflow/lite/g3doc/_book.yaml
@@ -65,6 +65,8 @@
       - title: "Post-training quantization example"
         path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tutorials/post_training_quant.ipynb
         status: external
+      - title: "Delegates"
+        path: /lite/performance/delegates
       - title: "GPU delegate"
         path: /lite/performance/gpu
       - title: "Advanced GPU"
diff --git a/tensorflow/lite/g3doc/r2/convert/index.md b/tensorflow/lite/g3doc/r2/convert/index.md
new file mode 100644
index 0000000..8370039
--- /dev/null
+++ b/tensorflow/lite/g3doc/r2/convert/index.md
@@ -0,0 +1,24 @@
+# TensorFlow Lite Converter
+
+The TensorFlow Lite converter takes a TensorFlow model represented as a concrete
+function, and generates a TensorFlow Lite
+[`FlatBuffer`](https://google.github.io/flatbuffers/) file (`.tflite`).
+
+Note: This page contains documentation on the converter API for TensorFlow 2.0.
+The API for TensorFlow 1.X is available
+[here](https://www.tensorflow.org/lite/convert/).
+
+## Device Deployment
+
+The TensorFlow Lite `FlatBuffer` file is then deployed to a client device (e.g.
+mobile, embedded) and run locally using the TensorFlow Lite interpreter. This
+conversion process is shown in the diagram below:
+
+![TFLite converter workflow](../images/convert/workflow.svg)
+
+## Converting Models
+
+The TensorFlow Lite Converter can be used from the [Python API](python_api.md).
+Using the Python API makes it easier to convert models as part of a model
+development pipeline and helps mitigate
+[compatibility](../guide/ops_compatibility.md) issues early on.
diff --git a/tensorflow/lite/g3doc/r2/convert/python_api.md b/tensorflow/lite/g3doc/r2/convert/python_api.md
new file mode 100644
index 0000000..c0f8ab8
--- /dev/null
+++ b/tensorflow/lite/g3doc/r2/convert/python_api.md
@@ -0,0 +1,237 @@
+# Converter Python API guide
+
+This page provides examples on how to use the
+[TensorFlow Lite Converter](index.md) using the Python API in TensorFlow 2.0.
+
+[TOC]
+
+## Python API
+
+The Python API for converting TensorFlow models to TensorFlow Lite in TensorFlow
+2.0 is
+[`tf.lite.TFLiteConverter.from_concrete_function()`](https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/lite/TFLiteConverter).
+
+This document contains [example usages](#examples) of the API, a detailed list
+of [changes in the API between 1.X and 2.0](#differences), and
+[instructions](#versioning) on running the different versions of TensorFlow.
+
+## Examples <a name="examples"></a>
+
+### Exporting a concrete function <a name="concrete_function"></a>
+
+The following example shows how to convert a TensorFlow concrete function into a
+TensorFlow Lite `FlatBuffer`.
+
+```python
+import tensorflow as tf
+
+# Construct a basic model.
+root = tf.train.Checkpoint()
+root.v1 = tf.Variable(3.)
+root.v2 = tf.Variable(2.)
+root.f = tf.function(lambda x: root.v1 * root.v2 * x)
+
+# Create the concrete function.
+input_data = tf.constant(1., shape=[1, 1])
+concrete_func = root.f.get_concrete_function(input_data)
+
+# Convert the model.
+converter = tf.lite.TFLiteConverter.from_concrete_function(concrete_func)
+tflite_model = converter.convert()
+```
+
+### Exporting a SavedModel <a name="saved_model"></a>
+
+The following example shows how to convert a SavedModel into a TensorFlow Lite
+`FlatBuffer`.
+
+Note: Due to a known issue with preserving input shapes in SavedModels,
+`set_shape` needs to be called for all input tensors.
+
+```python
+import tensorflow as tf
+
+# Construct a basic model.
+root = tf.train.Checkpoint()
+root.v1 = tf.Variable(3.)
+root.v2 = tf.Variable(2.)
+root.f = tf.function(lambda x: root.v1 * root.v2 * x)
+
+# Save the model.
+export_dir = "/tmp/test_saved_model"
+input_data = tf.constant(1., shape=[1, 1])
+to_save = root.f.get_concrete_function(input_data)
+tf.saved_model.save(root, export_dir, to_save)
+
+# Load model and get the concrete function.
+model = tf.saved_model.load(export_dir)
+concrete_func = model.signatures[
+  tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+
+# Set the shape manually.
+concrete_func.inputs[0].set_shape(input_data.shape)
+
+# Convert the model.
+converter = tf.lite.TFLiteConverter.from_concrete_function(concrete_func)
+tflite_model = converter.convert()
+```
+
+### Exporting a Keras model <a name="keras"></a>
+
+The following example shows how to convert a `tf.keras` model into a TensorFlow
+Lite `FlatBuffer`.
+
+```python
+import tensorflow as tf
+
+# Create a simple Keras model.
+x = [-1, 0, 1, 2, 3, 4]
+y = [-3, -1, 1, 3, 5, 7]
+
+model = tf.keras.models.Sequential(
+    [tf.keras.layers.Dense(units=1, input_shape=[1])])
+model.compile(optimizer='sgd', loss='mean_squared_error')
+model.fit(x, y, epochs=50)
+
+# Get the concrete function from the Keras model.
+to_save = tf.function(lambda x : model(x))
+concrete_func = to_save.get_concrete_function(
+    tf.TensorSpec([None, 1], tf.float32))
+
+# Convert the model.
+converter = tf.lite.TFLiteConverter.from_concrete_function(concrete_func)
+tflite_model = converter.convert()
+```
+
+### End-to-end MobileNet conversion <a name="mobilenet"></a>
+
+The following example shows how to convert a pre-trained `tf.keras` MobileNet
+model to TensorFlow Lite and run inference on it. To load the converted model
+from a file instead of from memory, use `model_path` instead of
+`model_content`, as shown in the short snippet after the example below.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+# Load the MobileNet tf.keras model.
+model = tf.keras.applications.MobileNetV2(
+    weights="imagenet", input_shape=(224, 224, 3))
+
+# Save and load the model to generate the concrete function to export.
+export_dir = "/tmp/test_model/mobilenet"
+tf.saved_model.save(model, export_dir)
+model = tf.saved_model.load(export_dir)
+concrete_func = model.signatures[
+  tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+
+# Set the shape manually.
+concrete_func.inputs[0].set_shape([1, 224, 224, 3])
+
+# Convert the model.
+converter = tf.lite.TFLiteConverter.from_concrete_function(concrete_func)
+tflite_model = converter.convert()
+
+# Load TFLite model and allocate tensors.
+interpreter = tf.lite.Interpreter(model_content=tflite_model)
+interpreter.allocate_tensors()
+
+# Get input and output tensors.
+input_details = interpreter.get_input_details()
+output_details = interpreter.get_output_details()
+
+# Test model on random input data.
+input_shape = input_details[0]['shape']
+input_data = np.array(np.random.random_sample(input_shape), dtype=np.float32)
+interpreter.set_tensor(input_details[0]['index'], input_data)
+
+interpreter.invoke()
+output_data = interpreter.get_tensor(output_details[0]['index'])
+print(output_data)
+```
+
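+To load the converted model from disk rather than from memory, write the
+`FlatBuffer` out and pass `model_path` to the interpreter. A short sketch
+(the file name here is arbitrary):
+
+```python
+# Save the FlatBuffer, then load it by path instead of by content.
+with open("/tmp/mobilenet.tflite", "wb") as f:
+  f.write(tflite_model)
+
+interpreter = tf.lite.Interpreter(model_path="/tmp/mobilenet.tflite")
+interpreter.allocate_tensors()
+```
+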
+## Summary of changes in `TFLiteConverter` between 1.X and 2.0 <a name="differences"></a>
+
+The following attributes and methods associated with
+[quantization-aware training](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/quantize)
+have been removed from `TFLiteConverter` in TensorFlow 2.0:
+
+*   `inference_type`
+*   `inference_input_type`
+*   `quantized_input_stats`
+*   `default_ranges_stats`
+*   `reorder_across_fake_quant`
+*   `change_concat_input_ranges`
+*   `post_training_quantize` - Deprecated in the 1.X API
+*   `get_input_arrays()`
+
+The rewriter function that supports quantization-aware training does not support
+models generated by TensorFlow 2.0. Additionally, TensorFlow Lite’s quantization
+API is being reworked and streamlined in a direction that supports
+quantization-aware training through the Keras API. These attributes have been
+removed from the 2.0 API until the new quantization API is launched. Users who
+want to convert models generated by the rewriter function can use
+`tensorflow.compat.v1`.
+
+The `target_ops` attribute has moved to `TargetSpec` and has been renamed to
+`supported_ops`, in line with future additions to the optimization framework.
+
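+As a brief sketch of the new location (API names as documented for the 2.0
+preview at the time of writing), `supported_ops` is now set on the converter's
+`target_spec`:
+
+```python
+converter = tf.lite.TFLiteConverter.from_concrete_function(concrete_func)
+# Previously (1.X): converter.target_ops = [...]
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS]
+tflite_model = converter.convert()
+```
+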
+Additionally, the following attributes have been removed:
+
+*   `drop_control_dependency` (default: `True`) - Control flow is currently not
+    supported by TFLite so it is always `True`.
+*   _Graph visualization_ - The recommended approach for visualizing a
+    TensorFlow Lite graph in TensorFlow 2.0 will be to use
+    [visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/visualize.py).
+    Unlike GraphViz, it enables users to visualize the graph after post-training
+    quantization has occurred. The following attributes related to graph
+    visualization will be removed:
+    *   `output_format`
+    *   `dump_graphviz_dir`
+    *   `dump_graphviz_video`
+
+The following methods that were previously deprecated in 1.X will no longer be
+exported in 2.0:
+
+*   `lite.toco_convert`
+*   `lite.TocoConverter`
+
+If any of the changes raise concerns, please file a
+[GitHub issue](https://github.com/tensorflow/tensorflow/issues).
+
+## Installing TensorFlow <a name="versioning"></a>
+
+### Installing the TensorFlow 2.0 nightly <a name="2.0-nightly"></a>
+
+The TensorFlow 2.0 nightly can be installed using the following command:
+
+```
+pip install tf-nightly-2.0-preview
+```
+
+### Using TensorFlow 2.0 from a 1.X installation <a name="use-2.0-from-1.X"></a>
+
+TensorFlow 2.0 can be enabled from recent 1.X installations using the following
+code snippet.
+
+```python
+import tensorflow.compat.v2 as tf
+
+tf.enable_v2_behavior()
+```
+
+### Using TensorFlow 1.X from a 2.0 installation <a name="use-1.X-from-2.0"></a>
+
+TensorFlow 1.X can be enabled from a 2.0 installation. This can be useful if
+you rely on features that are no longer supported in 2.0.
+
+```python
+import tensorflow.compat.v1 as tf
+```
+
+### Build from source code <a name="latest_package"></a>
+
+In order to run the latest version of the TensorFlow Lite Converter Python API,
+either install the nightly build with
+[pip](https://www.tensorflow.org/install/pip) (recommended) or
+[Docker](https://www.tensorflow.org/install/docker), or
+[build the pip package from source](https://www.tensorflow.org/install/source).
diff --git a/tensorflow/lite/g3doc/r2/images/convert/workflow.svg b/tensorflow/lite/g3doc/r2/images/convert/workflow.svg
new file mode 100644
index 0000000..2d8339f
--- /dev/null
+++ b/tensorflow/lite/g3doc/r2/images/convert/workflow.svg
@@ -0,0 +1 @@
+<!-- workflow.svg: single-line minified SVG (TensorFlow Lite converter workflow diagram); vector path data omitted -->
-0.703125l1.609375 0l0 -6.578125l-1.546875 0l0 -0.703125l2.421875 0l0 7.28125l1.609375 0l0 0.703125l-4.09375 0zm7.968796 -5.609375q1.1875 0 1.796875 0.625q0.625 0.609375 0.625 2.0625l0 2.921875l-0.9375 0l0 -0.84375q-0.5 0.96875 -1.875 0.96875q-0.90625 0 -1.421875 -0.40625q-0.515625 -0.421875 -0.515625 -1.09375q0 -0.578125 0.359375 -1.0q0.375 -0.4375 1.0 -0.671875q0.640625 -0.234375 1.390625 -0.234375q0.6875 0 1.234375 0.0625q-0.0625 -0.921875 -0.484375 -1.296875q-0.40625 -0.375 -1.21875 -0.375q-0.421875 0 -0.796875 0.15625q-0.375 0.15625 -0.6875 0.453125l-0.421875 -0.5625q0.765625 -0.765625 1.953125 -0.765625zm-0.3125 5.078125q0.890625 0 1.40625 -0.515625q0.515625 -0.53125 0.5625 -1.515625q-0.53125 -0.078125 -1.15625 -0.078125q-0.90625 0 -1.4375 0.296875q-0.53125 0.296875 -0.53125 0.921875q0 0.890625 1.15625 0.890625zm8.718811 0.03125q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.8594055 0.625q-0.484375 0 -0.90625 -0.21875q-0.421875 -0.21875 -0.703125 -0.625l-0.3125 0.71875l-0.546875 0l0 -7.984375l0.984375 0l0 0.09375q-0.078125 0.0625 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 2.8125q0.265625 -0.453125 0.71875 -0.703125q0.453125 -0.265625 0.921875 -0.265625q1.03125 0 1.640625 0.71875q0.609375 0.71875 0.609375 2.09375q0 0.9375 -0.328125 1.609375q-0.3125 0.65625 -0.84375 0.984375q-0.53125 0.328125 -1.125 0.328125zm-0.109375 -0.765625q0.65625 0 1.078125 -0.5q0.4375 -0.515625 0.4375 -1.609375q0 -1.046875 -0.40625 -1.578125q-0.390625 -0.546875 -1.078125 -0.546875q-0.671875 0 -1.09375 0.609375q-0.421875 0.59375 -0.421875 1.546875q0 2.078125 1.484375 2.078125zm5.578186 0.765625q-0.875 0 -1.390625 -0.640625q-0.515625 -0.640625 -0.5 -1.90625l0.015625 -3.0625l0.84375 0l0 3.0625q0 0.984375 0.328125 1.421875q0.34375 0.4375 0.921875 0.4375q0.609375 0 1.03125 -0.484375q0.4375 -0.484375 0.4375 -1.40625l0 -3.03125l0.84375 0l0 4.625q0 0.296875 0.015625 0.484375q0.015625 0.1875 0.09375 0.375l-0.828125 0q-0.078125 -0.1875 -0.09375 -0.375q-0.015625 -0.1875 -0.015625 -0.46875q-0.265625 0.453125 -0.71875 0.71875q-0.453125 0.25 -0.984375 0.25zm8.5625305 -6.78125l-0.015625 0.015625q-0.328125 -0.421875 -0.5625 -0.5625q-0.234375 -0.15625 -0.65625 -0.15625q-0.5625 0 -0.890625 0.34375q-0.328125 0.328125 -0.328125 1.078125l0 0.453125l1.8125 0l0 0.6875l-1.8125 0l0 4.796875l-0.828125 0l0 -4.796875l-1.1875 0l0 -0.6875l1.1875 0l0 -0.453125q0 -1.0625 0.546875 -1.578125q0.546875 -0.53125 1.46875 -0.53125q0.53125 0 0.984375 0.1875q0.453125 0.1875 0.734375 0.5625l-0.453125 0.640625zm-0.140625 0.03125q0 -0.03125 0.046875 0q0.046875 0.015625 0.0625 0.015625l-0.046875 0.046875l-0.0625 -0.046875l0 -0.015625zm0.125 -0.015625q0.078125 0.09375 0.03125 0.0625q-0.03125 -0.03125 -0.046875 -0.03125l0.015625 -0.03125zm6.015686 -0.015625l-0.015625 0.015625q-0.328125 -0.421875 -0.5625 -0.5625q-0.234375 -0.15625 -0.65625 -0.15625q-0.5625 0 -0.890625 0.34375q-0.328125 0.328125 -0.328125 1.078125l0 0.453125l1.8125 0l0 0.6875l-1.8125 0l0 4.796875l-0.828125 0l0 -4.796875l-1.1875 0l0 -0.6875l1.1875 0l0 -0.453125q0 -1.0625 0.546875 
-1.578125q0.546875 -0.53125 1.46875 -0.53125q0.53125 0 0.984375 0.1875q0.453125 0.1875 0.734375 0.5625l-0.453125 0.640625zm-0.140625 0.03125q0 -0.03125 0.046875 0q0.046875 0.015625 0.0625 0.015625l-0.046875 0.046875l-0.0625 -0.046875l0 -0.015625zm0.125 -0.015625q0.078125 0.09375 0.03125 0.0625q-0.03125 -0.03125 -0.046875 -0.03125l0.015625 -0.03125zm4.0625305 6.765625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.640686 -1.0625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625z" fill-rule="nonzero"/><path fill="#f4cccc" d="m359.81628 297.8261l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m359.81628 297.8261l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m388.06418 310.84216l0 -6.734375l-2.125 0l0 -0.75l5.1875 0l0 0.75l-2.21875 0l0 6.734375l-0.84375 0zm4.4375305 0l0 -7.484375l4.3125 0l0 0.734375l-3.46875 0l0 2.34375l2.796875 0l0 0.734375l-2.796875 0l0 3.671875l-0.84375 0zm5.859436 -7.46875l0.984375 0l0 0.078125q-0.078125 0.078125 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 6.234375l3.59375 0l0 0.71875l-4.46875 0l0 -7.46875zm6.4062805 7.46875l0 -0.703125l1.40625 0l0 -4.078125l-1.34375 0l0 -0.703125l2.203125 0l0 4.78125l1.28125 0l0 0.703125l-3.546875 0zm1.78125 -6.640625q-0.25 0 -0.4375 -0.171875q-0.171875 -0.1875 -0.171875 -0.4375q0 -0.265625 0.171875 -0.4375q0.171875 -0.1875 0.4375 -0.1875q0.25 0 0.4375 0.1875q0.1875 0.1875 0.1875 0.4375q0 0.25 -0.1875 0.4375q-0.1875 0.171875 -0.4375 0.171875zm8.343811 6.140625q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.9375305 0.625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 
0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0z" fill-rule="nonzero"/><path fill="#000000" d="m371.76718 324.8422l0 -0.703125l1.40625 0l0 -4.078125l-1.34375 0l0 -0.703125l2.203125 0l0 4.78125l1.28125 0l0 0.703125l-3.546875 0zm1.78125 -6.640625q-0.25 0 -0.4375 -0.171875q-0.171875 -0.1875 -0.171875 -0.4375q0 -0.265625 0.171875 -0.4375q0.171875 -0.1875 0.4375 -0.1875q0.25 0 0.4375 0.1875q0.1875 0.1875 0.1875 0.4375q0 0.25 -0.1875 0.4375q-0.1875 0.171875 -0.4375 0.171875zm3.8750305 1.15625l0.84375 0l0 0.96875q0.328125 -0.5 0.8125 -0.796875q0.5 -0.296875 1.046875 -0.296875q0.734375 0 1.171875 0.5625q0.4375 0.546875 0.4375 1.71875l0 3.328125l-0.84375 0l0 -3.296875q0 -0.8125 -0.28125 -1.1875q-0.265625 -0.375 -0.71875 -0.375q-0.375 0 -0.75 0.21875q-0.375 0.21875 -0.625 0.609375q-0.25 0.390625 -0.25 0.875l0 3.15625l-0.84375 0l0 -5.484375zm10.468811 4.984375q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.9375305 0.625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.640686 -1.0625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm1.7344055 -1.1875l0.875 0l0 1.078125q0.1875 -0.59375 0.625 -0.890625q0.4375 -0.3125 1.046875 -0.3125q0.625 0 1.140625 0.328125q0.53125 0.3125 0.84375 0.953125q0.3125 0.625 0.3125 1.546875q0 0.921875 -0.328125 1.59375q-0.328125 0.65625 -0.859375 1.0q-0.53125 0.328125 -1.140625 0.328125q-0.484375 0 -0.921875 -0.21875q-0.421875 -0.234375 -0.703125 -0.640625l0 2.71875l-0.890625 0l0 -7.484375zm2.375 4.859375q0.65625 0 1.109375 -0.5q0.453125 -0.5 0.453125 -1.625q0 -1.015625 -0.40625 -1.5625q-0.390625 -0.546875 -1.125 -0.546875q-0.671875 0 -1.109375 0.578125q-0.421875 0.5625 -0.421875 1.71875q0.03125 0.953125 0.421875 1.453125q0.390625 0.484375 1.078125 0.484375zm8.015686 -3.71875l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 
0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm4.2812805 4.421875q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.859436 2.78125q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.9375305 0.625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.640686 -1.0625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m318.40903 313.32217l41.417328 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m318.40903 313.32217l37.990265 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m356.39926 313.32217l-1.1245728 1.1245728l3.0897827 -1.1245728l-3.0897827 -1.1245728z" fill-rule="evenodd"/><path fill="#c9daf8" d="m495.33072 255.95735l87.49606 0l0 30.992111l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m495.33072 255.95735l87.49606 0l0 30.992111l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m531.6832 271.48904l2.90625 0l0 4.15625q-0.6875 0.21875 -1.390625 
0.328125q-0.703125 0.125 -1.625 0.125q-1.9375 0 -3.03125 -1.15625q-1.078125 -1.171875 -1.078125 -3.25q0 -1.34375 0.53125 -2.34375q0.546875 -1.0 1.546875 -1.53125q1.015625 -0.53125 2.359375 -0.53125q1.375 0 2.5625 0.5l-0.390625 0.875q-1.15625 -0.484375 -2.234375 -0.484375q-1.5625 0 -2.453125 0.9375q-0.875 0.921875 -0.875 2.578125q0 1.734375 0.84375 2.640625q0.859375 0.890625 2.5 0.890625q0.890625 0 1.734375 -0.21875l0 -2.625l-1.90625 0l0 -0.890625zm10.392578 -1.59375q0 1.3125 -0.890625 2.015625q-0.890625 0.6875 -2.53125 0.6875l-1.015625 0l0 3.375l-1.0 0l0 -8.5625l2.234375 0q3.203125 0 3.203125 2.484375zm-4.4375 1.859375l0.90625 0q1.3125 0 1.90625 -0.421875q0.59375 -0.4375 0.59375 -1.390625q0 -0.84375 -0.5625 -1.25q-0.546875 -0.421875 -1.734375 -0.421875l-1.109375 0l0 3.484375zm12.693359 -4.34375l0 5.53125q0 1.46875 -0.890625 2.3125q-0.875 0.84375 -2.421875 0.84375q-1.546875 0 -2.390625 -0.84375q-0.84375 -0.859375 -0.84375 -2.328125l0 -5.515625l1.0 0l0 5.578125q0 1.078125 0.578125 1.65625q0.59375 0.578125 1.71875 0.578125q1.09375 0 1.671875 -0.578125q0.59375 -0.578125 0.59375 -1.65625l0 -5.578125l0.984375 0z" fill-rule="nonzero"/><path fill="#c9daf8" d="m495.33072 297.8261l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m495.33072 297.8261l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m532.15686 310.04532q-1.40625 0 -2.234375 0.9375q-0.8125 0.9375 -0.8125 2.578125q0 1.671875 0.78125 2.59375q0.796875 0.921875 2.25 0.921875q0.90625 0 2.046875 -0.328125l0 0.875q-0.890625 0.34375 -2.1875 0.34375q-1.890625 0 -2.921875 -1.15625q-1.03125 -1.15625 -1.03125 -3.265625q0 -1.328125 0.484375 -2.3125q0.5 -1.0 1.4375 -1.53125q0.9375 -0.546875 2.203125 -0.546875q1.34375 0 2.359375 0.484375l-0.421875 0.859375q-0.984375 -0.453125 -1.953125 -0.453125zm9.3359375 1.71875q0 1.3125 -0.890625 2.015625q-0.890625 0.6875 -2.53125 0.6875l-1.015625 0l0 3.375l-1.0 0l0 -8.5625l2.234375 0q3.203125 0 3.203125 2.484375zm-4.4375 1.859375l0.90625 0q1.3125 0 1.90625 -0.421875q0.59375 -0.4375 0.59375 -1.390625q0 -0.84375 -0.5625 -1.25q-0.546875 -0.421875 -1.734375 -0.421875l-1.109375 0l0 3.484375zm12.693359 -4.34375l0 5.53125q0 1.46875 -0.890625 2.3125q-0.875 0.84375 -2.421875 0.84375q-1.546875 0 -2.390625 -0.84375q-0.84375 -0.859375 -0.84375 -2.328125l0 -5.515625l1.0 0l0 5.578125q0 1.078125 0.578125 1.65625q0.59375 0.578125 1.71875 0.578125q1.09375 0 1.671875 -0.578125q0.59375 -0.578125 0.59375 -1.65625l0 -5.578125l0.984375 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m165.71129 313.32217l65.19685 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m165.71129 313.32217l61.76976 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m227.48105 313.32217l-1.124588 1.1245728l3.0897675 -1.1245728l-3.0897675 -1.1245728z" fill-rule="evenodd"/><path fill="#d9ead3" d="m359.81628 119.71654l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m359.81628 119.71654l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m384.31497 114.11024l77.480316 0l0 31.748024l-77.480316 0z" fill-rule="evenodd"/><path fill="#000000" d="m401.3306 130.27086q0 2.109375 -1.15625 3.234375q-1.140625 1.125 -3.3125 1.125l-2.375 0l0 -8.5625l2.625 0q2.0 0 
3.109375 1.109375q1.109375 1.09375 1.109375 3.09375zm-1.046875 0.03125q0 -1.671875 -0.84375 -2.515625q-0.84375 -0.859375 -2.5 -0.859375l-1.453125 0l0 6.84375l1.21875 0q1.78125 0 2.671875 -0.875q0.90625 -0.875 0.90625 -2.59375zm6.763672 4.328125l-0.203125 -0.921875l-0.046875 0q-0.46875 0.609375 -0.953125 0.828125q-0.46875 0.21875 -1.1875 0.21875q-0.953125 0 -1.5 -0.5q-0.546875 -0.5 -0.546875 -1.40625q0 -1.9375 3.109375 -2.03125l1.09375 -0.03125l0 -0.40625q0 -0.75 -0.328125 -1.109375q-0.3125 -0.359375 -1.03125 -0.359375q-0.8125 0 -1.8125 0.484375l-0.3125 -0.75q0.484375 -0.25 1.046875 -0.390625q0.5625 -0.15625 1.140625 -0.15625q1.140625 0 1.6875 0.515625q0.5625 0.5 0.5625 1.625l0 4.390625l-0.71875 0zm-2.203125 -0.6875q0.90625 0 1.421875 -0.5q0.53125 -0.5 0.53125 -1.390625l0 -0.578125l-0.984375 0.03125q-1.15625 0.046875 -1.671875 0.375q-0.5 0.3125 -0.5 0.984375q0 0.53125 0.3125 0.8125q0.3125 0.265625 0.890625 0.265625zm7.001953 0q0.25 0 0.484375 -0.03125q0.25 -0.046875 0.390625 -0.078125l0 0.734375q-0.15625 0.078125 -0.46875 0.125q-0.296875 0.0625 -0.546875 0.0625q-1.859375 0 -1.859375 -1.96875l0 -3.828125l-0.921875 0l0 -0.46875l0.921875 -0.40625l0.40625 -1.359375l0.5625 0l0 1.484375l1.859375 0l0 0.75l-1.859375 0l0 3.78125q0 0.578125 0.265625 0.890625q0.28125 0.3125 0.765625 0.3125zm6.111328 0.6875l-0.203125 -0.921875l-0.046875 0q-0.46875 0.609375 -0.953125 0.828125q-0.46875 0.21875 -1.1875 0.21875q-0.953125 0 -1.5 -0.5q-0.546875 -0.5 -0.546875 -1.40625q0 -1.9375 3.109375 -2.03125l1.09375 -0.03125l0 -0.40625q0 -0.75 -0.328125 -1.109375q-0.3125 -0.359375 -1.03125 -0.359375q-0.8125 0 -1.8125 0.484375l-0.3125 -0.75q0.484375 -0.25 1.046875 -0.390625q0.5625 -0.15625 1.140625 -0.15625q1.140625 0 1.6875 0.515625q0.5625 0.5 0.5625 1.625l0 4.390625l-0.71875 0zm-2.203125 -0.6875q0.90625 0 1.421875 -0.5q0.53125 -0.5 0.53125 -1.390625l0 -0.578125l-0.984375 0.03125q-1.15625 0.046875 -1.671875 0.375q-0.5 0.3125 -0.5 0.984375q0 0.53125 0.3125 0.8125q0.3125 0.265625 0.890625 0.265625zm10.822266 0.6875l-1.0 0l0 -7.6875l-2.703125 0l0 -0.875l6.421875 0l0 0.875l-2.71875 0l0 7.6875zm2.8417969 -6.421875l1.046875 0l1.40625 3.65625q0.453125 1.265625 0.5625 1.8125l0.046875 0q0.078125 -0.296875 0.3125 -1.015625q0.25 -0.734375 1.609375 -4.453125l1.03125 0l-2.75 7.3125q-0.421875 1.078125 -0.96875 1.53125q-0.546875 0.46875 -1.34375 0.46875q-0.4375 0 -0.875 -0.109375l0 -0.78125q0.328125 0.078125 0.71875 0.078125q1.0 0 1.4375 -1.125l0.359375 -0.921875l-2.59375 -6.453125zm10.046875 6.546875q-0.625 0 -1.140625 -0.234375q-0.515625 -0.234375 -0.875 -0.71875l-0.0625 0q0.0625 0.5625 0.0625 1.0625l0 2.65625l-0.96875 0l0 -9.3125l0.796875 0l0.125 0.875l0.046875 0q0.375 -0.53125 0.875 -0.765625q0.5 -0.234375 1.140625 -0.234375q1.28125 0 1.96875 0.875q0.703125 0.875 0.703125 2.453125q0 1.578125 -0.703125 2.46875q-0.703125 0.875 -1.96875 0.875zm-0.140625 -5.84375q-0.984375 0 -1.421875 0.546875q-0.4375 0.546875 -0.453125 1.734375l0 0.21875q0 1.359375 0.453125 1.9375q0.453125 0.578125 1.453125 0.578125q0.828125 0 1.296875 -0.671875q0.46875 -0.671875 0.46875 -1.859375q0 -1.203125 -0.46875 -1.84375q-0.46875 -0.640625 -1.328125 -0.640625zm7.2285156 5.84375q-1.421875 0 -2.25 -0.875q-0.828125 -0.875 -0.828125 -2.40625q0 -1.5625 0.765625 -2.46875q0.765625 -0.921875 2.0625 -0.921875q1.203125 0 1.90625 0.796875q0.703125 0.796875 0.703125 2.09375l0 0.625l-4.421875 0q0.03125 1.125 0.5625 1.71875q0.546875 0.578125 1.53125 0.578125q1.03125 0 2.046875 -0.4375l0 0.875q-0.515625 0.21875 -0.984375 0.3125q-0.453125 0.109375 -1.09375 0.109375zm-0.265625 
-5.84375q-0.78125 0 -1.25 0.5q-0.453125 0.5 -0.53125 1.390625l3.359375 0q0 -0.921875 -0.40625 -1.40625q-0.40625 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#f4cccc" d="m466.6929 119.71654l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m466.6929 119.71654l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m491.1916 114.11024l100.0 0l0 31.748024l-100.0 0z" fill-rule="evenodd"/><path fill="#000000" d="m501.36346 134.63023l0 -8.5625l1.0 0l0 8.5625l-1.0 0zm7.595703 0l0 -4.15625q0 -0.78125 -0.359375 -1.171875q-0.34375 -0.390625 -1.109375 -0.390625q-1.015625 0 -1.484375 0.546875q-0.46875 0.546875 -0.46875 1.796875l0 3.375l-0.96875 0l0 -6.421875l0.796875 0l0.15625 0.875l0.046875 0q0.296875 -0.46875 0.828125 -0.734375q0.546875 -0.265625 1.203125 -0.265625q1.171875 0 1.75 0.5625q0.59375 0.5625 0.59375 1.796875l0 4.1875l-0.984375 0zm5.8652344 -5.671875l-1.625 0l0 5.671875l-0.984375 0l0 -5.671875l-1.140625 0l0 -0.4375l1.140625 -0.34375l0 -0.359375q0 -2.375 2.078125 -2.375q0.5 0 1.1875 0.203125l-0.25 0.78125q-0.5625 -0.171875 -0.953125 -0.171875q-0.5625 0 -0.828125 0.375q-0.25 0.359375 -0.25 1.15625l0 0.421875l1.625 0l0 0.75zm4.1132812 -0.875q0.421875 0 0.765625 0.078125l-0.140625 0.90625q-0.390625 -0.09375 -0.703125 -0.09375q-0.78125 0 -1.34375 0.640625q-0.546875 0.625 -0.546875 1.5625l0 3.453125l-0.96875 0l0 -6.421875l0.796875 0l0.125 1.1875l0.046875 0q0.34375 -0.625 0.84375 -0.96875q0.515625 -0.34375 1.125 -0.34375zm5.9140625 6.546875l-0.203125 -0.921875l-0.046875 0q-0.46875 0.609375 -0.953125 0.828125q-0.46875 0.21875 -1.1875 0.21875q-0.953125 0 -1.5 -0.5q-0.546875 -0.5 -0.546875 -1.40625q0 -1.9375 3.109375 -2.03125l1.09375 -0.03125l0 -0.40625q0 -0.75 -0.328125 -1.109375q-0.3125 -0.359375 -1.03125 -0.359375q-0.8125 0 -1.8125 0.484375l-0.3125 -0.75q0.484375 -0.25 1.046875 -0.390625q0.5625 -0.15625 1.140625 -0.15625q1.140625 0 1.6875 0.515625q0.5625 0.5 0.5625 1.625l0 4.390625l-0.71875 0zm-2.203125 -0.6875q0.90625 0 1.421875 -0.5q0.53125 -0.5 0.53125 -1.390625l0 -0.578125l-0.984375 0.03125q-1.15625 0.046875 -1.671875 0.375q-0.5 0.3125 -0.5 0.984375q0 0.53125 0.3125 0.8125q0.3125 0.265625 0.890625 0.265625zm9.064453 -1.0625q0 0.890625 -0.671875 1.390625q-0.65625 0.484375 -1.875 0.484375q-1.265625 0 -1.984375 -0.40625l0 -0.90625q0.46875 0.234375 0.984375 0.375q0.53125 0.125 1.03125 0.125q0.765625 0 1.171875 -0.234375q0.40625 -0.25 0.40625 -0.75q0 -0.375 -0.328125 -0.640625q-0.3125 -0.265625 -1.265625 -0.625q-0.890625 -0.34375 -1.28125 -0.59375q-0.375 -0.25 -0.5625 -0.5625q-0.171875 -0.3125 -0.171875 -0.75q0 -0.78125 0.640625 -1.234375q0.640625 -0.46875 1.75 -0.46875q1.03125 0 2.03125 0.421875l-0.359375 0.796875q-0.953125 -0.390625 -1.75 -0.390625q-0.6875 0 -1.046875 0.21875q-0.34375 0.203125 -0.34375 0.59375q0 0.25 0.125 0.4375q0.140625 0.171875 0.421875 0.34375q0.296875 0.15625 1.140625 0.46875q1.140625 0.421875 1.53125 0.84375q0.40625 0.421875 0.40625 1.0625zm3.6621094 1.0625q0.25 0 0.484375 -0.03125q0.25 -0.046875 0.390625 -0.078125l0 0.734375q-0.15625 0.078125 -0.46875 0.125q-0.296875 0.0625 -0.546875 0.0625q-1.859375 0 -1.859375 -1.96875l0 -3.828125l-0.921875 0l0 -0.46875l0.921875 -0.40625l0.40625 -1.359375l0.5625 0l0 1.484375l1.859375 0l0 0.75l-1.859375 0l0 3.78125q0 0.578125 0.265625 0.890625q0.28125 0.3125 0.765625 0.3125zm5.095703 -5.859375q0.421875 0 0.765625 0.078125l-0.140625 0.90625q-0.390625 -0.09375 -0.703125 
-0.09375q-0.78125 0 -1.34375 0.640625q-0.546875 0.625 -0.546875 1.5625l0 3.453125l-0.96875 0l0 -6.421875l0.796875 0l0.125 1.1875l0.046875 0q0.34375 -0.625 0.84375 -0.96875q0.515625 -0.34375 1.125 -0.34375zm2.8828125 0.125l0 4.171875q0 0.78125 0.34375 1.171875q0.359375 0.375 1.125 0.375q1.015625 0 1.46875 -0.546875q0.46875 -0.546875 0.46875 -1.796875l0 -3.375l0.96875 0l0 6.421875l-0.796875 0l-0.140625 -0.859375l-0.046875 0q-0.296875 0.46875 -0.828125 0.734375q-0.53125 0.25 -1.21875 0.25q-1.171875 0 -1.75 -0.5625q-0.578125 -0.5625 -0.578125 -1.78125l0 -4.203125l0.984375 0zm9.005859 6.546875q-1.390625 0 -2.15625 -0.859375q-0.765625 -0.859375 -0.765625 -2.4375q0 -1.609375 0.78125 -2.484375q0.78125 -0.890625 2.203125 -0.890625q0.46875 0 0.921875 0.109375q0.46875 0.09375 0.734375 0.234375l-0.296875 0.828125q-0.328125 -0.140625 -0.703125 -0.21875q-0.375 -0.078125 -0.671875 -0.078125q-1.953125 0 -1.953125 2.484375q0 1.1875 0.46875 1.828125q0.484375 0.625 1.421875 0.625q0.796875 0 1.640625 -0.34375l0 0.859375q-0.640625 0.34375 -1.625 0.34375zm5.2285156 -0.8125q0.25 0 0.484375 -0.03125q0.25 -0.046875 0.390625 -0.078125l0 0.734375q-0.15625 0.078125 -0.46875 0.125q-0.296875 0.0625 -0.546875 0.0625q-1.859375 0 -1.859375 -1.96875l0 -3.828125l-0.921875 0l0 -0.46875l0.921875 -0.40625l0.40625 -1.359375l0.5625 0l0 1.484375l1.859375 0l0 0.75l-1.859375 0l0 3.78125q0 0.578125 0.265625 0.890625q0.28125 0.3125 0.765625 0.3125zm3.0800781 -5.734375l0 4.171875q0 0.78125 0.34375 1.171875q0.359375 0.375 1.125 0.375q1.015625 0 1.46875 -0.546875q0.46875 -0.546875 0.46875 -1.796875l0 -3.375l0.96875 0l0 6.421875l-0.796875 0l-0.140625 -0.859375l-0.046875 0q-0.296875 0.46875 -0.828125 0.734375q-0.53125 0.25 -1.21875 0.25q-1.171875 0 -1.75 -0.5625q-0.578125 -0.5625 -0.578125 -1.78125l0 -4.203125l0.984375 0zm9.380859 -0.125q0.421875 0 0.765625 0.078125l-0.140625 0.90625q-0.390625 -0.09375 -0.703125 -0.09375q-0.78125 0 -1.34375 0.640625q-0.546875 0.625 -0.546875 1.5625l0 3.453125l-0.96875 0l0 -6.421875l0.796875 0l0.125 1.1875l0.046875 0q0.34375 -0.625 0.84375 -0.96875q0.515625 -0.34375 1.125 -0.34375zm4.6796875 6.671875q-1.421875 0 -2.25 -0.875q-0.828125 -0.875 -0.828125 -2.40625q0 -1.5625 0.765625 -2.46875q0.765625 -0.921875 2.0625 -0.921875q1.203125 0 1.90625 0.796875q0.703125 0.796875 0.703125 2.09375l0 0.625l-4.421875 0q0.03125 1.125 0.5625 1.71875q0.546875 0.578125 1.53125 0.578125q1.03125 0 2.046875 -0.4375l0 0.875q-0.515625 0.21875 -0.984375 0.3125q-0.453125 0.109375 -1.09375 0.109375zm-0.265625 -5.84375q-0.78125 0 -1.25 0.5q-0.453125 0.5 -0.53125 1.390625l3.359375 0q0 -0.921875 -0.40625 -1.40625q-0.40625 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#d9ead3" d="m27.741055 150.08989l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m27.741055 150.08989l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m60.082745 162.6372l0.984375 0l0 0.078125q-0.0625 0.078125 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 2.671875l2.9375 0l0 -3.1875l0.9843712 0l0 0.078125q-0.078125 0.078125 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 6.953125l-0.8906212 0l0 -3.5625l-2.921875 0l0 3.5625l-0.875 0l0 -7.46875zm6.046917 0l1.75 0q0.921875 0 1.453125 0.25q0.546875 0.234375 0.9375 0.765625q0.734375 0.984375 0.734375 2.75q-0.0625 1.8125 -0.84375 2.78125q-0.765625 0.953125 -2.421875 0.9375l-1.609375 0l0 -7.484375zm1.5625 6.828125q2.484375 0 2.484375 -3.0q-0.015625 -1.53125 
-0.578125 -2.328125q-0.546875 -0.796875 -1.765625 -0.796875l-0.90625 0l0 6.125l0.765625 0zm4.734421 0.640625l0 -7.484375l4.3125 0l0 0.734375l-3.46875 0l0 2.34375l2.796875 0l0 0.734375l-2.796875 0l0 3.671875l-0.84375 0zm8.140671 -4.859375q0.65625 0 1.15625 0.3125q0.515625 0.296875 0.8125 0.875q0.296875 0.5625 0.296875 1.3125q0 0.765625 -0.3125 1.328125q-0.296875 0.5625 -0.84375 0.859375q-0.53125 0.296875 -1.203125 0.296875q-0.6875 0 -1.265625 -0.296875q-0.578125 -0.296875 -0.953125 -0.84375l0.671875 -0.515625l0.015625 0q0.015625 0 0.015625 0q0 0 0 0q0.3125 0.484375 0.65625 0.71875q0.34375 0.21875 0.90625 0.21875q0.390625 0 0.71875 -0.21875q0.34375 -0.234375 0.53125 -0.625q0.203125 -0.40625 0.203125 -0.953125q0 -0.8125 -0.4375 -1.28125q-0.4375 -0.484375 -1.09375 -0.484375q-0.390625 0 -0.765625 0.1875q-0.359375 0.171875 -0.640625 0.515625l-0.53125 -0.21875l0.25 -3.796875l3.796875 0l0 0.75l-3.078125 0l-0.125 2.140625q0.59375 -0.28125 1.21875 -0.28125zm-1.625 3.328125q-0.0625 -0.09375 0.015625 -0.015625l-0.015625 0.015625zm0.125 0q0 0.0625 -0.109375 -0.015625l0.0625 -0.046875l0.046875 0.0625z" fill-rule="nonzero"/><path fill="#f4cccc" d="m26.839895 63.718502l87.49606 0l0 30.99213l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m26.839895 63.718502l87.49606 0l0 30.99213l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m51.915867 83.234566q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm5.890671 -6.15625l-0.015625 0.015625q-0.328125 -0.421875 -0.5625 -0.5625q-0.234375 -0.15625 -0.65625 -0.15625q-0.5625 0 -0.890625 0.34375q-0.328125 0.328125 -0.328125 1.078125l0 0.453125l1.8125 0l0 0.6875l-1.8125 0l0 4.796875l-0.828125 0l0 -4.796875l-1.1875 0l0 -0.6875l1.1875 0l0 -0.453125q0 -1.0625 0.546875 -1.578125q0.546875 -0.53125 1.46875 -0.53125q0.53125 0 0.984375 0.1875q0.453125 0.1875 0.734375 0.5625l-0.453125 0.640625zm-0.140625 0.03125q0 -0.03125 0.046875 0q0.046875 0.015625 0.0625 0.015625l-0.046875 0.046875l-0.0625 -0.046875l0 -0.015625zm0.125 -0.015625q0.078125 0.09375 0.03125 0.0625q-0.03125 -0.03125 -0.046875 -0.03125l0.015625 -0.03125zm3.6562958 6.765625q-0.296875 0 -0.5 -0.203125q-0.203125 -0.203125 -0.203125 -0.46875q0 -0.28125 0.203125 -0.484375q0.203125 -0.203125 0.5 -0.203125q0.265625 0 0.46875 0.203125q0.21875 0.203125 0.21875 0.484375q0 0.265625 -0.21875 0.46875q-0.203125 0.203125 -0.46875 0.203125zm5.1250496 -3.875l-0.578125 0.65625l0 3.09375l-0.90625 0l0 -7.46875l1.015625 0l0 0.078125q-0.078125 0.078125 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 2.921875l3.125 -3.5q0.296875 0.0625 0.609375 0.0625l0.3125 0l-2.828125 3.21875l3.03125 4.25l-1.078125 0.046875l-2.59375 -3.796875zm7.281296 3.875q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 
0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.640671 -1.0625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm3.9375458 -1.3125q1.1875 0 1.796875 0.625q0.625 0.609375 0.625 2.0625l0 2.921875l-0.9375 0l0 -0.84375q-0.5 0.96875 -1.875 0.96875q-0.90625 0 -1.421875 -0.40625q-0.515625 -0.421875 -0.515625 -1.09375q0 -0.578125 0.359375 -1.0q0.375 -0.4375 1.0 -0.671875q0.640625 -0.234375 1.390625 -0.234375q0.6875 0 1.234375 0.0625q-0.0625 -0.921875 -0.484375 -1.296875q-0.40625 -0.375 -1.21875 -0.375q-0.421875 0 -0.796875 0.15625q-0.375 0.15625 -0.6875 0.453125l-0.421875 -0.5625q0.765625 -0.765625 1.953125 -0.765625zm-0.3125 5.078125q0.890625 0 1.40625 -0.515625q0.515625 -0.53125 0.5625 -1.515625q-0.53125 -0.078125 -1.15625 -0.078125q-0.90625 0 -1.4375 0.296875q-0.53125 0.296875 -0.53125 0.921875q0 0.890625 1.15625 0.890625zm6.781296 -2.703125q1.03125 0.3125 1.453125 0.6875q0.4375 0.359375 0.4375 0.953125q0 0.734375 -0.59375 1.234375q-0.578125 0.484375 -1.671875 0.484375q-1.390625 0 -2.328125 -0.875l0.46875 -0.8125l0.015625 -0.015625l0.015625 0.015625q0.375 0.484375 0.765625 0.734375q0.40625 0.234375 1.078125 0.234375q0.65625 0 1.015625 -0.234375q0.375 -0.234375 0.375 -0.640625q0 -0.359375 -0.296875 -0.578125q-0.296875 -0.234375 -1.078125 -0.484375q-2.0625 -0.59375 -2.0625 -1.703125q0 -0.640625 0.515625 -1.0q0.53125 -0.375 1.5 -0.375q0.75 0 1.25 0.203125q0.515625 0.203125 0.9375 0.65625l-0.5 0.59375l0 0.015625q-0.265625 -0.390625 -0.734375 -0.609375q-0.453125 -0.21875 -0.921875 -0.21875q-0.515625 0 -0.859375 0.1875q-0.328125 0.171875 -0.328125 0.5q0 0.296875 0.328125 0.546875q0.34375 0.25 1.21875 0.5zm1.15625 -0.875q0 -0.0625 0.09375 0l-0.03125 0.046875l-0.0625 -0.046875zm0.140625 -0.03125q0.03125 0.046875 0.015625 0.0625q0 0.015625 -0.03125 0q-0.015625 -0.015625 -0.03125 -0.03125l0.046875 -0.03125zm-3.375 2.53125q0 0.046875 -0.109375 0l0.03125 -0.0625l0.078125 0.046875l0 0.015625zm-0.140625 0.03125q-0.03125 -0.046875 -0.03125 -0.046875q0.015625 0 0.0625 0.015625l-0.03125 0.03125z" fill-rule="nonzero"/><path fill="#f4cccc" d="m127.456696 63.71982l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m127.456696 63.71982l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m137.97523 83.73588l1.8125 -8.5625l1.0 0l-1.625 7.65625l3.3125 0l-0.1875 0.90625l-4.3125 0zm8.955078 -6.5q1.109375 0 1.75 0.65625q0.65625 0.65625 0.65625 1.8125q0 1.09375 -0.421875 2.078125q-0.421875 0.984375 -1.140625 1.515625q-0.71875 0.53125 -1.625 0.53125q-1.125 0 -1.765625 -0.65625q-0.640625 -0.671875 -0.640625 -1.8125q0 -1.125 0.421875 -2.09375q0.4375 -0.984375 1.15625 -1.5q0.734375 -0.53125 1.609375 -0.53125zm1.390625 2.328125q0 -0.671875 -0.375 -1.078125q-0.359375 -0.421875 -0.984375 
-0.421875q-0.640625 0 -1.15625 0.4375q-0.5 0.421875 -0.78125 1.203125q-0.28125 0.765625 -0.28125 1.703125q0 0.765625 0.375 1.1875q0.375 0.421875 1.078125 0.421875q0.609375 0 1.09375 -0.421875q0.484375 -0.4375 0.75 -1.21875q0.28125 -0.796875 0.28125 -1.8125zm6.451172 4.171875l-0.1875 -3.53125q-0.03125 -0.546875 -0.03125 -1.0l0 -0.921875l-0.046875 0l-0.296875 0.6875l-0.484375 1.109375l-1.703125 3.65625l-1.1875 0l-0.25 -6.421875l0.953125 0l0.109375 3.484375l0 0.515625q0 0.859375 -0.046875 1.578125l0.03125 0q0.28125 -0.734375 0.8125 -1.890625l1.71875 -3.6875l1.078125 0l0.21875 3.484375q0.03125 0.984375 0.03125 1.53125l0 0.3125l-0.015625 0.25l0.03125 0q0.171875 -0.515625 0.484375 -1.28125q0.328125 -0.78125 1.90625 -4.296875l1.03125 0l-2.953125 6.421875l-1.203125 0zm8.15625 0l-0.984375 0l1.953125 -9.125l0.984375 0l-1.953125 9.125zm4.625 0.125q-1.078125 0 -1.703125 -0.640625q-0.609375 -0.640625 -0.609375 -1.78125q0 -1.09375 0.4375 -2.109375q0.4375 -1.015625 1.15625 -1.578125q0.71875 -0.5625 1.578125 -0.5625q0.90625 0 1.359375 0.390625q0.453125 0.390625 0.453125 1.09375q0 1.046875 -0.984375 1.65625q-0.96875 0.59375 -2.78125 0.59375l-0.1875 0l-0.03125 0.46875q0 0.765625 0.359375 1.203125q0.359375 0.4375 1.125 0.4375q0.359375 0 0.75 -0.109375q0.390625 -0.109375 0.96875 -0.390625l0 0.859375q-0.546875 0.25 -0.96875 0.359375q-0.421875 0.109375 -0.921875 0.109375zm0.8125 -5.828125q-0.609375 0 -1.140625 0.5625q-0.53125 0.546875 -0.8125 1.515625l0.078125 0q1.328125 0 2.03125 -0.34375q0.71875 -0.34375 0.71875 -1.015625q0 -0.3125 -0.21875 -0.515625q-0.203125 -0.203125 -0.65625 -0.203125zm3.5273438 5.703125l-0.734375 -6.421875l0.984375 0l0.375 3.59375q0.140625 1.515625 0.140625 2.125l0.03125 0q0.75 -1.625 1.046875 -2.1875l1.90625 -3.53125l1.046875 0l-3.46875 6.421875l-1.328125 0zm7.1210938 0.125q-1.078125 0 -1.703125 -0.640625q-0.609375 -0.640625 -0.609375 -1.78125q0 -1.09375 0.4375 -2.109375q0.4375 -1.015625 1.15625 -1.578125q0.71875 -0.5625 1.578125 -0.5625q0.90625 0 1.359375 0.390625q0.453125 0.390625 0.453125 1.09375q0 1.046875 -0.984375 1.65625q-0.96875 0.59375 -2.78125 0.59375l-0.1875 0l-0.03125 0.46875q0 0.765625 0.359375 1.203125q0.359375 0.4375 1.125 0.4375q0.359375 0 0.75 -0.109375q0.390625 -0.109375 0.96875 -0.390625l0 0.859375q-0.546875 0.25 -0.96875 0.359375q-0.421875 0.109375 -0.921875 0.109375zm0.8125 -5.828125q-0.609375 0 -1.140625 0.5625q-0.53125 0.546875 -0.8125 1.515625l0.078125 0q1.328125 0 2.03125 -0.34375q0.71875 -0.34375 0.71875 -1.015625q0 -0.3125 -0.21875 -0.515625q-0.203125 -0.203125 -0.65625 -0.203125zm3.5273438 5.703125l-0.984375 0l1.953125 -9.125l0.984375 0l-1.953125 9.125zm9.6171875 -2.71875l-2.90625 0l-1.4375 2.71875l-1.109375 0l4.6875 -8.5625l1.015625 0l1.078125 8.5625l-1.0 0l-0.328125 -2.71875zm-0.109375 -0.921875l-0.203125 -1.75q-0.140625 -1.046875 -0.171875 -2.046875q-0.21875 0.515625 -0.46875 1.03125q-0.25 0.5 -1.46875 2.765625l2.3125 0zm8.802734 -2.71875q0 1.453125 -0.96875 2.21875q-0.953125 0.765625 -2.8125 0.765625l-0.796875 0l-0.71875 3.375l-1.0 0l1.8125 -8.5625l1.9375 0q1.25 0 1.890625 0.5625q0.65625 0.546875 0.65625 1.640625zm-4.390625 2.140625l0.78125 0q1.265625 0 1.921875 -0.53125q0.65625 -0.546875 0.65625 -1.578125q0 -0.734375 -0.40625 -1.046875q-0.40625 -0.328125 -1.25 -0.328125l-0.96875 0l-0.734375 3.484375zm4.8847656 4.21875l1.828125 -8.5625l0.984375 0l-1.828125 8.5625l-0.984375 0z" fill-rule="nonzero"/><path fill="#c9daf8" d="m495.33072 214.08858l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" 
stroke-linejoin="round" stroke-linecap="butt" d="m495.33072 214.08858l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m527.26624 234.10464l-1.140625 0l-4.6875 -7.1875l-0.046875 0q0.09375 1.265625 0.09375 2.3125l0 4.875l-0.921875 0l0 -8.5625l1.125 0l4.671875 7.15625l0.046875 0q0 -0.15625 -0.046875 -1.015625q-0.046875 -0.859375 -0.03125 -1.234375l0 -4.90625l0.9375 0l0 8.5625zm9.046875 0l-1.140625 0l-4.6875 -7.1875l-0.046875 0q0.09375 1.265625 0.09375 2.3125l0 4.875l-0.921875 0l0 -8.5625l1.125 0l4.671875 7.15625l0.046875 0q0 -0.15625 -0.046875 -1.015625q-0.046875 -0.859375 -0.03125 -1.234375l0 -4.90625l0.9375 0l0 8.5625zm10.8515625 0l-1.0625 -2.71875l-3.4375 0l-1.046875 2.71875l-1.015625 0l3.390625 -8.609375l0.828125 0l3.375 8.609375l-1.03125 0zm-1.375 -3.625l-1.0 -2.65625q-0.1875 -0.5 -0.390625 -1.234375q-0.140625 0.5625 -0.375 1.234375l-1.0 2.65625l2.765625 0zm9.015625 -2.453125q0 1.3125 -0.890625 2.015625q-0.890625 0.6875 -2.53125 0.6875l-1.015625 0l0 3.375l-1.0 0l0 -8.5625l2.234375 0q3.203125 0 3.203125 2.484375zm-4.4375 1.859375l0.90625 0q1.3125 0 1.90625 -0.421875q0.59375 -0.4375 0.59375 -1.390625q0 -0.84375 -0.5625 -1.25q-0.546875 -0.421875 -1.734375 -0.421875l-1.109375 0l0 3.484375zm6.2246094 4.21875l0 -8.5625l1.0 0l0 8.5625l-1.0 0z" fill-rule="nonzero"/><path fill="#c9daf8" d="m495.33072 172.21982l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m495.33072 172.21982l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m535.1793 191.61089q0 -0.390625 0.171875 -0.59375q0.1875 -0.203125 0.515625 -0.203125q0.34375 0 0.53125 0.203125q0.1875 0.203125 0.1875 0.59375q0 0.390625 -0.1875 0.59375q-0.1875 0.203125 -0.53125 0.203125q-0.296875 0 -0.5 -0.1875q-0.1875 -0.1875 -0.1875 -0.609375zm3.1933594 0q0 -0.390625 0.171875 -0.59375q0.1875 -0.203125 0.515625 -0.203125q0.34375 0 0.53125 0.203125q0.1875 0.203125 0.1875 0.59375q0 0.390625 -0.1875 0.59375q-0.1875 0.203125 -0.53125 0.203125q-0.296875 0 -0.5 -0.1875q-0.1875 -0.1875 -0.1875 -0.609375zm3.1933594 0q0 -0.390625 0.171875 -0.59375q0.1875 -0.203125 0.515625 -0.203125q0.34375 0 0.53125 0.203125q0.1875 0.203125 0.1875 0.59375q0 0.390625 -0.1875 0.59375q-0.1875 0.203125 -0.53125 0.203125q-0.296875 0 -0.5 -0.1875q-0.1875 -0.1875 -0.1875 -0.609375z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m71.48908 181.08202l0 27.68892l50.58268 0l0 27.681168" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m71.48908 181.08202l0 27.68892l50.582672 0l0 24.254074" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m122.071754 233.02501l-1.1245804 -1.1245728l1.1245804 3.0897675l1.124588 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m447.31235 313.32217l24.009003 0l0 0.06298828l24.022491 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m447.31235 313.32217l24.009003 0l0 0.06298828l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m491.91675 313.38516l-1.1245728 1.1246033l3.0897522 -1.1246033l-3.0897522 -1.1245728z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m447.31235 313.32217l24.009003 0l0 -41.858276l24.022491 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" 
stroke-linejoin="round" stroke-linecap="butt" d="m447.31235 313.32217l24.009003 0l0 -41.858276l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m491.91675 271.4639l-1.1245728 1.1246033l3.0897522 -1.1246033l-3.0897522 -1.1245728z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m447.31235 313.32217l24.009003 0l0 -83.74803l24.022491 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m447.31235 313.32217l24.009003 0l0 -83.74803l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m491.91675 229.57414l-1.1245728 1.124588l3.0897522 -1.124588l-3.0897522 -1.1245728z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m447.31235 313.32217l24.009003 0l0 -125.60629l24.022491 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m447.31235 313.32217l24.009003 0l0 -125.60629l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m491.91675 187.71588l-1.1245728 1.124588l3.0897522 -1.124588l-3.0897522 -1.124588z" fill-rule="evenodd"/><path fill="#d9ead3" d="m128.68855 150.08989l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m128.68855 150.08989l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m146.01448 165.98096q1.0625 0.4375 1.46875 0.90625q0.40625 0.46875 0.40625 1.171875q0 0.5625 -0.265625 1.0625q-0.265625 0.484375 -0.828125 0.796875q-0.5625 0.3125 -1.390625 0.3125q-1.453125 0 -2.34375 -0.953125l0.4375 -0.75l0 -0.015625q0 0 0 0.015625q0 0 0 0q0.328125 0.421875 0.828125 0.6875q0.515625 0.25 1.1875 0.25q0.671875 0 1.109375 -0.359375q0.4375 -0.375 0.4375 -0.921875q0 -0.34375 -0.140625 -0.578125q-0.125 -0.234375 -0.484375 -0.453125q-0.359375 -0.234375 -1.078125 -0.546875q-1.109375 -0.4375 -1.59375 -0.984375q-0.46875 -0.5625 -0.46875 -1.234375q0 -0.84375 0.609375 -1.34375q0.625 -0.515625 1.671875 -0.515625q0.609375 0 1.140625 0.25q0.546875 0.25 0.9375 0.6875l-0.46875 0.625l-0.015625 0.015625q-0.359375 -0.484375 -0.75 -0.671875q-0.390625 -0.203125 -0.96875 -0.203125q-0.578125 0 -0.9375 0.328125q-0.34375 0.3125 -0.34375 0.765625q0 0.34375 0.140625 0.609375q0.15625 0.25 0.546875 0.5q0.390625 0.25 1.15625 0.546875zm1.03125 -1.84375q0 -0.046875 0.046875 -0.015625q0.046875 0.015625 0.0625 0.015625l-0.03125 0.046875l-0.078125 -0.046875l0 0zm0.125 -0.03125q0.078125 0.09375 0.03125 0.0625q-0.03125 -0.03125 -0.046875 -0.03125l0.015625 -0.03125zm-3.546875 4.375q0 0.03125 -0.046875 0.015625q-0.046875 -0.03125 -0.0625 -0.03125l0.03125 -0.046875l0.078125 0.046875l0 0.015625zm-0.125 0.03125q-0.078125 -0.09375 0.015625 -0.046875l-0.015625 0.046875zm7.859421 -4.015625q1.1875 0 1.796875 0.625q0.625 0.609375 0.625 2.0625l0 2.921875l-0.9375 0l0 -0.84375q-0.5 0.96875 -1.875 0.96875q-0.90625 0 -1.421875 -0.40625q-0.515625 -0.421875 -0.515625 -1.09375q0 -0.578125 0.359375 -1.0q0.375 -0.4375 1.0 -0.671875q0.640625 -0.234375 1.390625 -0.234375q0.6875 0 1.234375 0.0625q-0.0625 -0.921875 -0.484375 -1.296875q-0.40625 -0.375 -1.21875 -0.375q-0.421875 0 -0.796875 0.15625q-0.375 0.15625 -0.6875 0.453125l-0.421875 -0.5625q0.765625 -0.765625 1.953125 -0.765625zm-0.3125 5.078125q0.890625 0 1.40625 -0.515625q0.515625 -0.53125 0.5625 -1.515625q-0.53125 -0.078125 
[SVG path data elided: several thousand characters of vector glyph outlines, connector arrows, and boxes belonging to a diagram (.svg) file carried in this diff; no caption text is recoverable from the raw path data.]</g></svg>
diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc
index 75a23de..6888183 100644
--- a/tensorflow/lite/interpreter.cc
+++ b/tensorflow/lite/interpreter.cc
@@ -19,12 +19,14 @@
 #include <cstdarg>
 #include <cstdint>
 #include <cstring>
+#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
 
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/context_util.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/graph_info.h"
 #include "tensorflow/lite/memory_planner.h"
+#include "tensorflow/lite/minimal_logging.h"
 #include "tensorflow/lite/nnapi_delegate.h"
 #include "tensorflow/lite/profiling/profiler.h"
 #include "tensorflow/lite/schema/schema_generated.h"
@@ -55,6 +57,13 @@
 Interpreter::Interpreter(ErrorReporter* error_reporter)
     : error_reporter_(error_reporter ? error_reporter
                                      : DefaultErrorReporter()) {
+  // Only log initialization once per-process to avoid log spam.
+  static std::once_flag init_log_once_flag;
+  std::call_once(init_log_once_flag, []() {
+    // TODO(b/128420794): Include the TFLite runtime version in the log.
+    TFLITE_LOG_PROD(TFLITE_LOG_INFO, "Initialized TensorFlow Lite runtime.");
+  });
+
   // There's always at least 1 subgraph which is the primary subgraph.
   AddSubgraphs(1);
   context_ = primary_subgraph().context();
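The hunk above guards the "Initialized TensorFlow Lite runtime." message with a function-local static std::once_flag, so constructing any number of Interpreter objects logs at most once per process. A minimal, self-contained sketch of the same pattern follows; it uses std::cerr in place of the TFLITE_LOG_PROD macro, and InitializeRuntime is a made-up stand-in for the constructor.

    #include <iostream>
    #include <mutex>  // for std::once_flag / std::call_once

    // Hypothetical stand-in for per-instance setup that should announce
    // itself only on the first call in the process.
    void InitializeRuntime() {
      static std::once_flag init_log_once_flag;
      std::call_once(init_log_once_flag, []() {
        std::cerr << "INFO: Initialized runtime.\n";  // emitted exactly once
      });
      // ... per-instance initialization would continue here ...
    }

    int main() {
      InitializeRuntime();  // prints the INFO line
      InitializeRuntime();  // silent: the once_flag is already set
      return 0;
    }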
diff --git a/tensorflow/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc
index e1aedfe..7c76b2d 100644
--- a/tensorflow/lite/interpreter_test.cc
+++ b/tensorflow/lite/interpreter_test.cc
@@ -14,6 +14,7 @@
 ==============================================================================*/
 
 #include "tensorflow/lite/interpreter.h"
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
@@ -51,13 +52,25 @@
 }  // namespace ops
 namespace {
 
+using ::testing::IsEmpty;
+
 // Make an interpreter that has no tensors and no nodes
 TEST(BasicInterpreter, ZeroInterpreter) {
+  testing::internal::CaptureStderr();
+
   Interpreter interpreter;
+  EXPECT_THAT(testing::internal::GetCapturedStderr(),
+              testing::HasSubstr("INFO: Initialized TensorFlow Lite runtime"));
+
   interpreter.SetInputs({});
   interpreter.SetOutputs({});
   ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
   ASSERT_EQ(interpreter.Invoke(), kTfLiteOk);
+
+  // Creating a new interpreter should not redundantly log runtime init.
+  testing::internal::CaptureStderr();
+  Interpreter interpreter2;
+  EXPECT_THAT(testing::internal::GetCapturedStderr(), IsEmpty());
 }
 
 // Test various error conditions.
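The test above checks both sides of that behavior with googletest's stderr-capture helpers: the first Interpreter must log the banner, and a second one must stay silent. A reduced sketch of the same checking pattern, with a hypothetical LogOnce() helper standing in for the Interpreter constructor:

    #include <iostream>
    #include <mutex>

    #include <gmock/gmock.h>
    #include <gtest/gtest.h>

    // Hypothetical function under test: logs to stderr only on its first call,
    // mirroring the std::call_once block added to the Interpreter constructor.
    void LogOnce() {
      static std::once_flag flag;
      std::call_once(flag, [] { std::cerr << "INFO: Initialized once.\n"; });
    }

    TEST(OnceLoggingTest, LogsExactlyOnce) {
      testing::internal::CaptureStderr();
      LogOnce();
      EXPECT_THAT(testing::internal::GetCapturedStderr(),
                  testing::HasSubstr("Initialized"));

      // A second call must not produce any further output.
      testing::internal::CaptureStderr();
      LogOnce();
      EXPECT_THAT(testing::internal::GetCapturedStderr(), testing::IsEmpty());
    }

Note that CaptureStderr/GetCapturedStderr live in testing::internal and are not a stable public API; the production test accepts that trade-off in order to observe the log side effect directly.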
diff --git a/tensorflow/lite/kernels/comparisons.cc b/tensorflow/lite/kernels/comparisons.cc
index d492419..e49348a 100644
--- a/tensorflow/lite/kernels/comparisons.cc
+++ b/tensorflow/lite/kernels/comparisons.cc
@@ -129,6 +129,9 @@
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   bool requires_broadcast = !HaveSameShapes(input1, input2);
   switch (input1->type) {
+    case kTfLiteBool:
+      TF_LITE_COMPARISON(bool, Equal, requires_broadcast);
+      break;
     case kTfLiteFloat32:
       TF_LITE_COMPARISON(float, Equal, requires_broadcast);
       break;
@@ -147,9 +150,9 @@
                                  requires_broadcast);
       break;
     default:
-      context->ReportError(context,
-                           "Does not support type %d, requires float|int|uint8",
-                           input1->type);
+      context->ReportError(
+          context, "Does not support type %d, requires bool|float|int|uint8",
+          input1->type);
       return kTfLiteError;
   }
   return kTfLiteOk;
@@ -162,6 +165,9 @@
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   bool requires_broadcast = !HaveSameShapes(input1, input2);
   switch (input1->type) {
+    case kTfLiteBool:
+      TF_LITE_COMPARISON(bool, NotEqual, requires_broadcast);
+      break;
     case kTfLiteFloat32:
       TF_LITE_COMPARISON(float, NotEqual, requires_broadcast);
       break;
@@ -180,9 +186,9 @@
                                     requires_broadcast);
       break;
     default:
-      context->ReportError(context,
-                           "Does not support type %d, requires float|int|uint8",
-                           input1->type);
+      context->ReportError(
+          context, "Does not support type %d, requires bool|float|int|uint8",
+          input1->type);
       return kTfLiteError;
   }
   return kTfLiteOk;
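The two hunks above add a kTfLiteBool case to the Equal and NotEqual kernels (and extend the error message accordingly); dispatch still goes through the TF_LITE_COMPARISON macro used elsewhere in this file. Semantically the new path is a plain element-wise comparison. A scalar reference sketch of what the kernel computes for same-shaped inputs, with illustrative names only (broadcasting omitted):

    #include <cstddef>
    #include <vector>

    // Reference for element-wise Equal / NotEqual on same-shaped bool tensors.
    std::vector<bool> CompareReference(const std::vector<bool>& input1,
                                       const std::vector<bool>& input2,
                                       bool negate /* true => NotEqual */) {
      std::vector<bool> output(input1.size());
      for (std::size_t i = 0; i < input1.size(); ++i) {
        const bool equal = (input1[i] == input2[i]);
        output[i] = negate ? !equal : equal;
      }
      return output;
    }

    // Matches the expectations added in comparisons_test.cc below:
    //   Equal   ({1,0,1,0}, {1,1,0,0}) -> {1,0,0,1}
    //   NotEqual({1,0,1,0}, {1,1,0,0}) -> {0,1,1,0}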
diff --git a/tensorflow/lite/kernels/comparisons_test.cc b/tensorflow/lite/kernels/comparisons_test.cc
index 6ec1f09..3f950a3 100644
--- a/tensorflow/lite/kernels/comparisons_test.cc
+++ b/tensorflow/lite/kernels/comparisons_test.cc
@@ -92,6 +92,17 @@
   }
 };
 
+TEST(ComparisonsTest, EqualBool) {
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_BOOL,
+                          BuiltinOperator_EQUAL);
+  model.PopulateTensor<bool>(model.input1(), {true, false, true, false});
+  model.PopulateTensor<bool>(model.input2(), {true, true, false, false});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, true));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
+
 TEST(ComparisonsTest, EqualFloat) {
   ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32,
                           BuiltinOperator_EQUAL);
@@ -137,6 +148,17 @@
   EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4));
 }
 
+TEST(ComparisonsTest, NotEqualBool) {
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_BOOL,
+                          BuiltinOperator_NOT_EQUAL);
+  model.PopulateTensor<bool>(model.input1(), {true, false, true, false});
+  model.PopulateTensor<bool>(model.input2(), {true, true, false, false});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, true, false));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
+
 TEST(ComparisonsTest, NotEqualFloat) {
   ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32,
                           BuiltinOperator_NOT_EQUAL);
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index 1046860..93006fa 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -251,6 +251,7 @@
         ":optimized_base",
         ":tensor",
         ":types",
+        "//tensorflow/core/kernels:eigen_spatial_convolutions-inl",
         "//tensorflow/lite/c:c_api_internal",
         "//third_party/eigen3",
     ],
diff --git a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
index 8d9ad16..3e48d95 100644
--- a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
@@ -145,10 +145,29 @@
       break;
 #endif
     }
-    case DepthwiseConvImplementation::kUseNeon3x3DotProduct:
-    case DepthwiseConvImplementation::kUseUnwound3x3DotProduct:
-      // TODO(b/118426582) Placeholder for future dispatches.
+    case DepthwiseConvImplementation::kUseNeon3x3DotProduct: {
+#if defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)
+      DotProduct3x3KernelType kernel_type =
+          optimized_ops::depthwise_conv::CategorizeDotProductKernel(
+              input_shape, filter_shape, params);
+
+      ASSERT_TRUE(
+          kernel_type == DotProduct3x3KernelType::kPlain ||
+          kernel_type == DotProduct3x3KernelType::kStride2 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride2)
+          << "Kernel type = " << static_cast<int>(kernel_type);
+
+      optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3<
+          DepthwiseConvImplementation::kUseNeon3x3DotProduct>(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data);
+      return;
+#endif
       break;
+    }
     case DepthwiseConvImplementation::kUseCModel3x3DotProduct: {
       DotProduct3x3KernelType kernel_type =
           optimized_ops::depthwise_conv::CategorizeDotProductKernel(
@@ -181,17 +200,36 @@
           bias_shape, bias_data, output_shape, output_data);
       return;
     }
+    case DepthwiseConvImplementation::kUseUnwound3x3DotProduct: {
+      DotProduct3x3KernelType kernel_type =
+          optimized_ops::depthwise_conv::CategorizeDotProductKernel(
+              input_shape, filter_shape, params);
+      ASSERT_TRUE(
+          kernel_type == DotProduct3x3KernelType::kPlain ||
+          kernel_type == DotProduct3x3KernelType::kStride2 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride2);
+      optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3<
+          DepthwiseConvImplementation::kUseUnwound3x3DotProduct>(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data);
+      return;
+    }
     case DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct: {
 #if defined(USE_NEON)
-      using optimized_ops::depthwise_conv::DotProduct3x3KernelType;
       DotProduct3x3KernelType kernel_type =
           optimized_ops::depthwise_conv::CategorizeDotProductKernel(
               input_shape, filter_shape, params);
 
-      ASSERT_TRUE(kernel_type == DotProduct3x3KernelType::kPlain ||
-                  kernel_type == DotProduct3x3KernelType::kStride2 ||
-                  kernel_type ==
-                      DotProduct3x3KernelType::kWithDepthMultiplicationStride1);
+      ASSERT_TRUE(
+          kernel_type == DotProduct3x3KernelType::kPlain ||
+          kernel_type == DotProduct3x3KernelType::kStride2 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride2);
       optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3<
           DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct>(
           params, input_shape, input_data, filter_shape, filter_data,
@@ -744,6 +782,20 @@
         ),
     TestParam::TestNameSuffix);
 
+INSTANTIATE_TEST_SUITE_P(
+    Unwound, DepthwiseConvTest,
+    testing::Combine(
+        Values(DepthwiseConvImplementation::
+                   kUseUnwound3x3DotProduct),          // forced_invocation
+        Values(1000),                                  // tests_to_run
+        Bool(),                                        // test_stride
+        Bool(),                                        // test_pad
+        Bool(),                                        // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
+        Values(false)                                  // loose_tolerance
+        ),
+    TestParam::TestNameSuffix);
+
 #if defined(USE_NEON)
 INSTANTIATE_TEST_SUITE_P(
     Intrinsics, DepthwiseConvTest,
@@ -753,23 +805,25 @@
         Values(1000),                                  // tests_to_run
         Bool(),                                        // test_stride
         Bool(),                                        // test_pad
-        Values(false),                                 // test_depth_multiplier
+        Bool(),                                        // test_depth_multiplier
         Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
         Values(kLooseIntrinsicsTolerance)              // loose_tolerance
         ),
     TestParam::TestNameSuffix);
+#endif
 
+#if defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)
 INSTANTIATE_TEST_SUITE_P(
-    IntrinsicsAlt, DepthwiseConvTest,
+    NeonAsm, DepthwiseConvTest,
     testing::Combine(
         Values(DepthwiseConvImplementation::
-                   kUseIntrinsics3x3DotProduct),       // forced_invocation
+                   kUseNeon3x3DotProduct),             // forced_invocation
         Values(1000),                                  // tests_to_run
-        Values(false),                                 // test_stride
-        Values(false),                                 // test_pad
-        Values(true),                                  // test_depth_multiplier
+        Bool(),                                        // test_stride
+        Bool(),                                        // test_pad
+        Bool(),                                        // test_depth_multiplier
         Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
-        Values(kLooseIntrinsicsTolerance)              // loose_tolerance
+        Values(false)                                  // loose_tolerance
         ),
     TestParam::TestNameSuffix);
 #endif
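The test changes above add an Unwound instantiation, let the Intrinsics suite exercise depth multiplication, and replace IntrinsicsAlt with a NeonAsm suite that is only compiled when both __ARM_FEATURE_DOTPROD and __aarch64__ are defined. All of them rely on gtest's value-parameterized tests with testing::Combine. A stripped-down sketch of that mechanism, using a hypothetical two-parameter fixture rather than the real DepthwiseConvTest:

    #include <tuple>

    #include <gtest/gtest.h>

    // Hypothetical fixture: each test body runs once per (stride, pad)
    // combination produced by testing::Combine below.
    class ConvVariantTest
        : public testing::TestWithParam<std::tuple<int, bool>> {};

    TEST_P(ConvVariantTest, RunsForEveryCombination) {
      const int stride = std::get<0>(GetParam());
      const bool use_pad = std::get<1>(GetParam());
      EXPECT_GT(stride, 0);
      (void)use_pad;  // a real test would configure the kernel with these
    }

    // Mirrors the INSTANTIATE_TEST_SUITE_P calls in the diff: Values() pins a
    // parameter, Bool() expands to {false, true}, Combine() crosses them.
    INSTANTIATE_TEST_SUITE_P(StrideAndPad, ConvVariantTest,
                             testing::Combine(testing::Values(1, 2),  // stride
                                              testing::Bool()));      // pad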
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
index 39c01b7..50de905 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -34,6 +34,182 @@
 // to allow for this.
 constexpr int kWorkspaceExtension = 16;
 
+#ifdef USE_NEON
+// Lane operations are for clarity and convenience. We want to load and store
+// 4 8-bit lanes together. So these are treated much like 32-bit loads and
+// 32-bit stores. Stores require 32-bit alignment.
+
+#define vst1_lane_8x4(dst, reg, lane_num)                         \
+  TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
+  vst1_lane_u32(reinterpret_cast<uint32_t*>(dst), reg, lane_num)
+#define vst1q_lane_8x4(dst, reg, lane_num)                        \
+  TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
+  vst1q_lane_u32(reinterpret_cast<uint32_t*>(dst), reg, lane_num)
+
+#define vld1q_lane_s8x8(src, reg, lane_num) \
+  vld1q_lane_u64(reinterpret_cast<const uint64_t*>(src), reg, lane_num)
+#define vld1_lane_8x4(src, reg, lane_num) \
+  vld1_lane_s32(reinterpret_cast<const int32*>(src), reg, lane_num)
+#define vld1q_lane_8x4(src, reg, lane_num) \
+  vld1q_lane_s32(reinterpret_cast<const int32*>(src), reg, lane_num)
+#define vld1q_dup_s8x4(src) vld1q_dup_s32(reinterpret_cast<const int32*>(src))
+
+#ifndef __aarch64__
+inline int8x16_t vqtbl4q_s8(int8x16x4_t a, uint8x16_t b) {
+  const uint8x16_t mask = vtstq_u8(b, vdupq_n_u8(8));
+
+  // Delete bit 3 from the indices.
+  const uint8x16_t high_bits = vshrq_n_u8(b, 4);
+  uint8x16_t deleted_bit_3 = b;
+  deleted_bit_3 = vsliq_n_u8(deleted_bit_3, high_bits, 3);
+
+  int8x8x4_t repacked_data;
+
+  // Calculate for lower indices.
+  repacked_data.val[0] = vget_low_u8(a.val[0]);
+  repacked_data.val[1] = vget_low_u8(a.val[1]);
+  repacked_data.val[2] = vget_low_u8(a.val[2]);
+  repacked_data.val[3] = vget_low_u8(a.val[3]);
+  const int8x16_t output_for_lower =
+      vcombine_u8(vtbl4_s8(repacked_data, vget_low_u8(deleted_bit_3)),
+                  vtbl4_s8(repacked_data, vget_high_u8(deleted_bit_3)));
+
+  // Calculate for high indices.
+  repacked_data.val[0] = vget_high_u8(a.val[0]);
+  repacked_data.val[1] = vget_high_u8(a.val[1]);
+  repacked_data.val[2] = vget_high_u8(a.val[2]);
+  repacked_data.val[3] = vget_high_u8(a.val[3]);
+  const int8x16_t output_for_higher =
+      vcombine_u8(vtbl4_s8(repacked_data, vget_low_u8(deleted_bit_3)),
+                  vtbl4_s8(repacked_data, vget_high_u8(deleted_bit_3)));
+
+  // Merge.
+  int8x16_t output = mask;
+  output = vbslq_u8(output, output_for_higher, output_for_lower);
+  return output;
+}
+#endif  // !__aarch64__
+
+// Convenience-compatibility functions.
+// Compatibility: Intrinsics reflect a mixture of older and newer ARM
+//     instructions. This actually results in ZIP1 / ZIP2 asm instructions, but
+//     one intrinsic is provided. Also older instructions operated in place,
+//     and it seems more defensive to assume that some versions of intrinsics
+//     might reflect this.
+// Convenience: Callers in these kernels want both ZIP1 and ZIP2, and we do not
+//     want the calling code to get cluttered with unpacking int8x16x2_t.
+inline void vzipq_s8_in_place(int8x16_t* a, int8x16_t* b) {
+  int8x16x2_t r8x16;
+  r8x16 = vzipq_s8(*a, *b);
+  *a = r8x16.val[0];
+  *b = r8x16.val[1];
+}
+
+inline void vzipq_s8x2_in_place(int8x16_t* a, int8x16_t* b) {
+  int16x8x2_t r16x8;
+  r16x8 = vzipq_s16(vreinterpretq_s16_s8(*a), vreinterpretq_s16_s8(*b));
+  *a = vreinterpretq_s8_s16(r16x8.val[0]);
+  *b = vreinterpretq_s8_s16(r16x8.val[1]);
+}
+
+// Similar rationale to the zip-in_place functions, but callers only actually
+// need the TRN1 asm instruction result.
+inline void vtrn1_s8x2_in_place(int8x16_t* a, int8x16_t* b) {
+  int16x8x2_t r16x8;
+  r16x8 = vtrnq_s16(vreinterpretq_s16_s8(*a), vreinterpretq_s16_s8(*b));
+  *a = vreinterpretq_s8_s16(r16x8.val[0]);
+}
+
+inline void biregister_rotate_8(int8x16_t* left, int8x16_t* right) {
+  *left = vreinterpretq_s8_u32(vshrq_n_u32(vreinterpretq_u32_s8(*left), 8));
+  *left = vreinterpretq_s8_u32(vsliq_n_u32(vreinterpretq_u32_s8(*left),
+                                           vreinterpretq_u32_s8(*right), 24));
+  *right = vreinterpretq_s8_u32(vshrq_n_u32(vreinterpretq_u32_s8(*right), 8));
+}
+
+#ifndef __aarch64__
+inline int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
+  int32x4x2_t deinterleaved = vuzpq_s32(a, b);
+  return vqaddq_s32(deinterleaved.val[0], deinterleaved.val[1]);
+}
+#endif  // !__aarch64__
+
+#ifdef __ARM_FEATURE_DOTPROD
+// The vdotq_lane_s32 takes int8x8_t for the rhs parameter, whereas the actual
+// instruction selects from between 4 32-bit (4x8-bit packed) sub-registers, an
+// unusual interpretation of "lane".
+inline int32x4_t vdotq_four_lane_s32(int32x4_t acc, int8x16_t lhs,
+                                     int8x16_t rhs, const int lane) {
+  switch (lane) {
+    case 0:
+      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_low_s8(rhs)), 0);
+    case 1:
+      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_low_s8(rhs)), 1);
+    case 2:
+      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_high_s8(rhs)),
+                            0);
+    case 3:
+    default:
+      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_high_s8(rhs)),
+                            1);
+  }
+}
+
+#else
+
+inline int32x4_t vdotq_s32(int32x4_t acc, int8x16_t lhs, int8x16_t rhs) {
+  int32x4_t sum0 = vpaddlq_s16(vmull_s8(vget_low_s8(lhs), vget_low_s8(rhs)));
+  int32x4_t sum1 = vpaddlq_s16(vmull_s8(vget_high_s8(lhs), vget_high_s8(rhs)));
+  int32x4_t sum = vpaddq_s32(sum0, sum1);
+  return vaddq_s32(acc, sum);
+}
+
+inline int32x4_t vdotq_four_lane_s32(int32x4_t acc, int8x16_t lhs,
+                                     int8x16_t rhs, int lane) {
+  int8x8_t lane_rhs;
+  if (lane == 0) {
+    lane_rhs = vreinterpret_s8_s32(
+        vdup_lane_s32(vreinterpret_s32_s8(vget_low_s8(rhs)), 0));
+  } else if (lane == 1) {
+    lane_rhs = vreinterpret_s8_s32(
+        vdup_lane_s32(vreinterpret_s32_s8(vget_low_s8(rhs)), 1));
+  } else if (lane == 2) {
+    lane_rhs = vreinterpret_s8_s32(
+        vdup_lane_s32(vreinterpret_s32_s8(vget_high_s8(rhs)), 0));
+  } else {
+    lane_rhs = vreinterpret_s8_s32(
+        vdup_lane_s32(vreinterpret_s32_s8(vget_high_s8(rhs)), 1));
+  }
+  int32x4_t sum0 = vpaddlq_s16(vmull_s8(vget_low_s8(lhs), lane_rhs));
+  int32x4_t sum1 = vpaddlq_s16(vmull_s8(vget_high_s8(lhs), lane_rhs));
+  int32x4_t sum = vpaddq_s32(sum0, sum1);
+  return vaddq_s32(acc, sum);
+}
+
+#endif  // !DOTPROD
+#endif  // ARM NEON
+
+template <DepthwiseConvOutputRounding output_rounding>
+struct DivideByPOT {};
+
+template <>
+struct DivideByPOT<DepthwiseConvOutputRounding::kAwayFromZero> {
+  template <typename IntegerType>
+  static inline IntegerType Run(IntegerType x, int exponent) {
+    return RoundingDivideByPOT(x, exponent);
+  }
+};
+
+#ifdef USE_NEON
+template <>
+struct DivideByPOT<DepthwiseConvOutputRounding::kUpward> {
+  template <typename IntegerType>
+  static inline IntegerType Run(IntegerType x, int exponent) {
+    return vqrshlq_s32(x, vdupq_n_s32(static_cast<int32>(-exponent)));
+  }
+};
+#endif  // ARM NEON
+
 // See CategorizeDotProductKernel for definitive taxonomy.
 enum class DotProduct3x3KernelType {
   kNone = 0,  // Parameter combination is not supported for dot product kernels.
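Both branches of the __ARM_FEATURE_DOTPROD guard above implement the same contract: vdotq_s32 adds, into each 32-bit lane of the accumulator, the dot product of the corresponding group of four int8 lanes of lhs and rhs, while vdotq_four_lane_s32 reuses a single 4-byte group of rhs (selected by lane) for every output lane. A scalar reference of that contract, useful when checking the non-dot-product fallbacks; plain C++ with int8_t[16] / int32_t[4] arrays standing in for the 128-bit NEON registers:

    #include <cstdint>

    // Scalar model of vdotq_s32:
    //   acc[i] += sum over j in 0..3 of lhs[4*i + j] * rhs[4*i + j]
    void DotAccumulateReference(int32_t acc[4], const int8_t lhs[16],
                                const int8_t rhs[16]) {
      for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
          acc[i] += static_cast<int32_t>(lhs[4 * i + j]) *
                    static_cast<int32_t>(rhs[4 * i + j]);
        }
      }
    }

    // Scalar model of vdotq_four_lane_s32: every output lane is accumulated
    // against the same 4-byte group of rhs, chosen by `lane` in 0..3.
    void DotAccumulateFourLaneReference(int32_t acc[4], const int8_t lhs[16],
                                        const int8_t rhs[16], int lane) {
      for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
          acc[i] += static_cast<int32_t>(lhs[4 * i + j]) *
                    static_cast<int32_t>(rhs[4 * lane + j]);
        }
      }
    }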
@@ -86,7 +262,7 @@
   }
 }
 
-#if defined(USE_NEON)
+#ifdef USE_NEON
 
 #define STR(s) STR_UNEXPANDED(s)
 #define STR_UNEXPANDED(s) #s
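STR / STR_UNEXPANDED is the usual two-level stringizing idiom: the outer macro expands its argument first, so STR(OFFSET_INPUT_OFFSET) pastes the numeric offset, not the literal macro name, into the inline-assembly strings used further down (e.g. "ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"). A minimal illustration; OFFSET_EXAMPLE_FIELD is a made-up constant, not one of the real struct offsets:

    #include <cstdio>

    #define STR(s) STR_UNEXPANDED(s)
    #define STR_UNEXPANDED(s) #s

    // Hypothetical offset constant; the real kernels derive these from the
    // layout of their parameter struct.
    #define OFFSET_EXAMPLE_FIELD 16

    int main() {
      // Prints: ldr w9, [x0, #16]
      // Single-level stringizing would instead yield "OFFSET_EXAMPLE_FIELD".
      std::puts("ldr w9, [x0, #" STR(OFFSET_EXAMPLE_FIELD) "]");
      return 0;
    }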
@@ -234,7 +410,7 @@
   int32 four_over_stride;
 };
 
-#if defined(USE_NEON)
+#ifdef USE_NEON
 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
 template <int32 kDepth, int32 kStrideWidth, int32 kStrideHeight>
 struct DepthwiseConvWindow {};
@@ -297,7 +473,8 @@
         // Set "constant" registers. These registers may be replaced with temp
         // values from time to time when there are not enough NEON registers.
         // We use x9--x15 general purpose registers as they are caller-saved
-        // temporary registers (see http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf).  // NOLINT
+        // temporary registers (see
+        // http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf).  // NOLINT
         "ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
         "ldr x3, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
         "cmp %w[output_window_height], #2\n"
@@ -3576,6 +3753,3216 @@
   // implementation rather than conforming to style.
 };
 
+#if defined(USE_NEON) && defined(__aarch64__)
+// Experiments suggest that a modest performance improvement is seen, at least
+// on 855 chipset big cores, with cache hints.
+inline void PreloadInputBlock(
+    const uint8* input_block_data,
+    const DepthwiseConvDotProdParams* function_params) {
+  // Preload.
+  const int input_width_micro_repeats =
+      function_params->input_width_micro_repeats;
+  const int block_height = function_params->inbound_block_height;
+  const int residual_width = function_params->residual_width;
+  const int input_height_stride = function_params->input_height_stride;
+  const int input_depth = function_params->input_depth;
+
+  {
+    const int total_width = 4 * input_width_micro_repeats + residual_width;
+    const uint8* row_ptr = input_block_data;
+    for (int k_height = 0; k_height < block_height; ++k_height) {
+      const uint8* ptr = row_ptr;
+      for (int j = 0; j < total_width; ++j) {
+        // Input data is loaded once.
+        asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+        ptr += input_depth;
+      }
+      row_ptr += input_height_stride;
+    }
+  }
+}
+
+template <>
+struct ProcessPerDepth<DepthwiseConvImplementation::kUseNeon3x3DotProduct> {
+  static void ProcessPerDepthIntrinsics(
+      const uint8* filter_data, const int32* bias_data,
+      int8* shuffled_filter_data, int32* adjusted_bias_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int depth = function_params->output_depth;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int bias_increment = function_params->bias_increment;
+
+    constexpr int kSymmetricZeroPoint = 128;
+    constexpr uint8 kSignBit = 0x80;
+    const int32 input_offset = function_params->input_offset;
+    TFLITE_DCHECK_GE(input_offset, -255);
+    TFLITE_DCHECK_LE(input_offset, 0);
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+    const int8x16_t ones_vector = vdupq_n_s8(1);
+
+    // Simulate NEON-register transposition of subset of filter.
+    int8x16_t filter_reg_0_a;
+    int8x16_t filter_reg_0_b;
+    int8x16_t filter_reg_1_a;
+    int8x16_t filter_reg_1_b;
+    int8x16_t filter_reg_2_a;
+    int8x16_t filter_reg_2_b;
+
+    // Register pairs for each height.
+    // Effect subtraction of zero-point = 128 by XOR of sign bit.
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
+
+    const uint8* filter_block = filter_data;
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      // Filter data is provided as filter_block[3][3][depth/8][2][4].
+      // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4.
+      // filter_bank[3][2][4][4]; Sub-block, height 3, depth 4, width 4.
+
+      // Load zero-point into effective position of zero-padding of filter
+      // (register B, upper part).
+      filter_reg_0_b = vdupq_n_u8(kSignBit);
+      filter_reg_1_b = vdupq_n_u8(kSignBit);
+      filter_reg_2_b = vdupq_n_u8(kSignBit);
+
+      const uint8* filter_block_ptr = filter_block;
+      filter_reg_0_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_0_a, 0);
+      filter_block_ptr += depth;
+      filter_reg_0_b = vld1q_lane_s8x8(filter_block_ptr, filter_reg_0_b, 0);
+      filter_block_ptr += depth;
+      filter_reg_0_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_0_a, 1);
+      filter_block_ptr += depth;
+      filter_reg_1_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_1_a, 0);
+      filter_block_ptr += depth;
+      filter_reg_1_b = vld1q_lane_s8x8(filter_block_ptr, filter_reg_1_b, 0);
+      filter_block_ptr += depth;
+      filter_reg_1_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_1_a, 1);
+      filter_block_ptr += depth;
+      filter_reg_2_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_2_a, 0);
+      filter_block_ptr += depth;
+      filter_reg_2_b = vld1q_lane_s8x8(filter_block_ptr, filter_reg_2_b, 0);
+      filter_block_ptr += depth;
+      filter_reg_2_a = vld1q_lane_s8x8(filter_block_ptr, filter_reg_2_a, 1);
+
+      filter_reg_0_a = veorq_s8(filter_reg_0_a, sign_bit);
+      filter_reg_0_b = veorq_s8(filter_reg_0_b, sign_bit);
+      filter_reg_1_a = veorq_s8(filter_reg_1_a, sign_bit);
+      filter_reg_1_b = veorq_s8(filter_reg_1_b, sign_bit);
+      filter_reg_2_a = veorq_s8(filter_reg_2_a, sign_bit);
+      filter_reg_2_b = veorq_s8(filter_reg_2_b, sign_bit);
+
+      vzipq_s8_in_place(&filter_reg_0_a, &filter_reg_0_b);
+      vzipq_s8_in_place(&filter_reg_1_a, &filter_reg_1_b);
+      vzipq_s8_in_place(&filter_reg_2_a, &filter_reg_2_b);
+      vzipq_s8x2_in_place(&filter_reg_0_a, &filter_reg_0_b);
+      vzipq_s8x2_in_place(&filter_reg_1_a, &filter_reg_1_b);
+      vzipq_s8x2_in_place(&filter_reg_2_a, &filter_reg_2_b);
+
+      vst1q_s8(shuffled_filter_data, filter_reg_0_a);
+      shuffled_filter_data += 16;
+      vst1q_s8(shuffled_filter_data, filter_reg_0_b);
+      shuffled_filter_data += 16;
+      vst1q_s8(shuffled_filter_data, filter_reg_1_a);
+      shuffled_filter_data += 16;
+      vst1q_s8(shuffled_filter_data, filter_reg_1_b);
+      shuffled_filter_data += 16;
+      vst1q_s8(shuffled_filter_data, filter_reg_2_a);
+      shuffled_filter_data += 16;
+      vst1q_s8(shuffled_filter_data, filter_reg_2_b);
+      shuffled_filter_data += 16;
+
+      int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
+      bias_data += bias_increment;
+      int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
+      bias_data += bias_increment;
+      // For instance, if input_offset == 128, no adjustment is needed.
+
+      int32x4_t filter_sum_a = vdupq_n_s32(0);
+      filter_sum_a = vdotq_s32(filter_sum_a, filter_reg_0_a, ones_vector);
+      filter_sum_a = vdotq_s32(filter_sum_a, filter_reg_1_a, ones_vector);
+      filter_sum_a = vdotq_s32(filter_sum_a, filter_reg_2_a, ones_vector);
+      int32x4_t filter_sum_b = vdupq_n_s32(0);
+      filter_sum_b = vdotq_s32(filter_sum_b, filter_reg_0_b, ones_vector);
+      filter_sum_b = vdotq_s32(filter_sum_b, filter_reg_1_b, ones_vector);
+      filter_sum_b = vdotq_s32(filter_sum_b, filter_reg_2_b, ones_vector);
+
+      adjusted_bias_data_a = vmlaq_n_s32(adjusted_bias_data_a, filter_sum_a,
+                                         input_offset_difference);
+      adjusted_bias_data_b = vmlaq_n_s32(adjusted_bias_data_b, filter_sum_b,
+                                         input_offset_difference);
+
+      vst1q_s32(adjusted_bias_data, adjusted_bias_data_a);
+      adjusted_bias_data += 4;
+      vst1q_s32(adjusted_bias_data, adjusted_bias_data_b);
+      adjusted_bias_data += 4;
+
+      filter_block += 8;
+    }
+  }
+
+  static inline void Run(const uint8* filter_data, const int32* bias_data,
+                         int8* shuffled_filter_data, int32* adjusted_bias_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    ProcessPerDepthIntrinsics(filter_data, bias_data, shuffled_filter_data,
+                              adjusted_bias_data, function_params);
+  }
+};
+
+template <>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kNoMultiplication,
+                      /*max_padding=*/0> {
+  static inline void PackMacroBlockNeon(
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_top, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_left, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_right, 0);
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+    const int input_depth = function_params->input_depth;
+
+    static const uint8 perm_data[64] = {
+        0,  16, 32, 48, 1,  17, 33, 49, 2,  18, 34, 50, 3,  19, 35, 51,  //
+        4,  20, 36, 52, 5,  21, 37, 53, 6,  22, 38, 54, 7,  23, 39, 55,
+        8,  24, 40, 56, 9,  25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+        12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63};
+
+    TFLITE_DCHECK_GE(depth_micro_repeats, 0);
+    constexpr uint8 kSignBit = 0x80;
+    const int micro_block_size = 4 * 8;
+    const int depth_advance = width_overall_micro_repeats * micro_block_size;
+    const int width_advance =
+        micro_block_size *
+        (1 - depth_micro_repeats * width_overall_micro_repeats);
+    const int height_advance = workspace_height_stride -
+                               width_overall_micro_repeats * micro_block_size;
+    const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
+    // code. Note the blocks of 4x4 are still interleaved down the depth.
+    int8x16_t work_reg_a;
+    int8x16_t work_reg_b;
+    const int8x16_t perm_data_0 = vld1q_u8(perm_data);
+    const int8x16_t perm_data_1 = vld1q_u8(perm_data + 16);
+    const int8x16_t perm_data_2 = vld1q_u8(perm_data + 32);
+    const int8x16_t perm_data_3 = vld1q_u8(perm_data + 48);
+
+    // Effect subtraction of zero-point = 128 by XOR of sign bit.
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data_0 = scratch_block_data;
+
+    for (int k_height = 0; k_height < block_height; ++k_height) {
+      const uint8* input_data_0 = input_block_data;
+      const uint8* input_data_1 = input_block_data + input_depth;
+      const uint8* input_data_2 = input_block_data + 2 * input_depth;
+      const uint8* input_data_3 = input_block_data + 3 * input_depth;
+
+      // Traverse the width one point at a time, but the depth in (micro) blocks
+      // of size 8.
+      //
+      // The depth and width margins, which are filled with "zeros", may be
+      // larger than is strictly needed to calculate output. This is because the
+      // conv calculation is performed across complete micro blocks.
+      for (int j_width = 0; j_width < input_width_micro_repeats; ++j_width) {
+        int i_depth = 0;
+        for (; i_depth < depth_micro_repeats - 1; i_depth += 2) {
+          int8x16x4_t input_data;
+          input_data.val[0] = vld1q_u8(input_data_0);
+          input_data.val[1] = vld1q_u8(input_data_1);
+          input_data.val[2] = vld1q_u8(input_data_2);
+          input_data.val[3] = vld1q_u8(input_data_3);
+          input_data_1 += 16;
+          input_data_0 += 16;
+
+          int8x16_t tmp_0 = vqtbl4q_s8(input_data, perm_data_0);
+          int8x16_t tmp_1 = vqtbl4q_s8(input_data, perm_data_1);
+          work_reg_a = veorq_s8(tmp_0, sign_bit);
+          work_reg_b = veorq_s8(tmp_1, sign_bit);
+
+          vst1q_s8(scratch_data_0, work_reg_a);
+          vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+          scratch_data_0 += depth_advance;
+          input_data_2 += 16;
+          input_data_3 += 16;
+
+          tmp_0 = vqtbl4q_s8(input_data, perm_data_2);
+          tmp_1 = vqtbl4q_s8(input_data, perm_data_3);
+          work_reg_a = veorq_s8(tmp_0, sign_bit);
+          work_reg_b = veorq_s8(tmp_1, sign_bit);
+
+          vst1q_s8(scratch_data_0, work_reg_a);
+          vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+          scratch_data_0 += depth_advance;
+        }
+        for (; i_depth < depth_micro_repeats; ++i_depth) {
+          int8x16x4_t input_data;
+          input_data.val[0] =
+              vld1q_lane_s8x8(input_data_0, input_data.val[0], 0);
+          input_data.val[1] =
+              vld1q_lane_s8x8(input_data_1, input_data.val[1], 0);
+          input_data.val[2] =
+              vld1q_lane_s8x8(input_data_2, input_data.val[2], 0);
+          input_data.val[3] =
+              vld1q_lane_s8x8(input_data_3, input_data.val[3], 0);
+          input_data_1 += 8;
+          input_data_0 += 8;
+
+          int8x16_t tmp_0 = vqtbl4q_s8(input_data, perm_data_0);
+          int8x16_t tmp_1 = vqtbl4q_s8(input_data, perm_data_1);
+          work_reg_a = veorq_s8(tmp_0, sign_bit);
+          work_reg_b = veorq_s8(tmp_1, sign_bit);
+
+          vst1q_s8(scratch_data_0, work_reg_a);
+          vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+          scratch_data_0 += depth_advance;
+          input_data_2 += 8;
+          input_data_3 += 8;
+        }
+        scratch_data_0 += width_advance;
+        input_data_0 += input_depth_skip;
+        input_data_1 += input_depth_skip;
+        input_data_2 += input_depth_skip;
+        input_data_3 += input_depth_skip;
+      }
+      if (width_overall_micro_repeats > input_width_micro_repeats) {
+        TFLITE_DCHECK_EQ(width_overall_micro_repeats,
+                         input_width_micro_repeats + 1);
+        TFLITE_DCHECK_GT(residual_width, 0);
+        TFLITE_DCHECK_LT(residual_width, 4);
+        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+          work_reg_a = vdupq_n_u8(kSignBit);
+          work_reg_a = vld1q_lane_s8x8(input_data_0, work_reg_a, 0);
+          work_reg_b = vdupq_n_u8(kSignBit);
+          if (residual_width > 1) {
+            work_reg_b =
+                vld1q_lane_s8x8(input_data_0 + input_depth, work_reg_b, 0);
+            if (residual_width == 3) {
+              work_reg_a = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
+                                           work_reg_a, 1);
+            }
+          }
+          work_reg_a = veorq_s8(work_reg_a, sign_bit);
+          work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+          vzipq_s8_in_place(&work_reg_a, &work_reg_b);
+          vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+
+          vst1q_s8(scratch_data_0, work_reg_a);
+          vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+          scratch_data_0 += depth_advance;
+          input_data_0 += 8;
+          input_data_1 += 8;
+          input_data_2 += 8;
+          input_data_3 += 8;
+        }
+        scratch_data_0 += width_advance;
+        input_data_0 += input_depth_skip;
+        input_data_1 += input_depth_skip;
+        input_data_2 += input_depth_skip;
+        input_data_3 += input_depth_skip;
+      }
+      scratch_data_0 += height_advance;
+      input_block_data += input_height_stride;
+    }
+    TFLITE_DCHECK_EQ(
+        scratch_data_0,
+        scratch_block_data + block_height * workspace_height_stride);
+  }
+
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    PreloadInputBlock(input_block_data, function_params);
+    PackMacroBlockNeon(input_block_data, scratch_block_data, function_params);
+  }
+};
+
+template <>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kNoMultiplication,
+                      /*max_padding=*/1> {
+  static inline void PackMacroBlockNeon(
+      int32 height_block_number, int32 width_block_number,
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    constexpr uint8 kSignBit = 0x80;
+
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+    const int input_depth = function_params->input_depth;
+
+    const int padding_left = function_params->padding_left;
+    const int padding_right = function_params->padding_right;
+    const int padding_top = function_params->padding_top;
+    const int padding_bottom = function_params->padding_bottom;
+
+    TFLITE_DCHECK_GT(depth_micro_repeats, 0);
+    constexpr int kSymmetricZeroPoint = 128;
+
+    const int micro_block_size = 4 * 8;
+    const int depth_advance = width_overall_micro_repeats * micro_block_size;
+    const int width_advance =
+        micro_block_size *
+        (1 - depth_micro_repeats * width_overall_micro_repeats);
+    const int height_advance = workspace_height_stride -
+                               width_overall_micro_repeats * micro_block_size;
+    const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
+
+    const bool leading_width_padding =
+        padding_left > 0 && width_block_number == 0;
+    const bool trailing_width_padding =
+        padding_right > 0 &&
+        width_block_number == (function_params->width_macro_count - 1);
+    const bool leading_height_padding =
+        padding_top > 0 && height_block_number < 0;
+    const bool trailing_height_padding =
+        padding_bottom > 0 &&
+        height_block_number == (function_params->height_macro_count - 1);
+
+    const int32 input_offset = function_params->input_offset;
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
+    // code. Note the blocks of 4x4 are still interleaved down the depth.
+    int8x16_t work_reg_a;
+    int8x16_t work_reg_b;
+
+    // Effect subtraction of zero-point = 128 by XOR of sign bit.
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data_0 = scratch_block_data;
+
+    int copy_block_height = block_height;
+    if (leading_height_padding) {
+      copy_block_height -= 1;
+      memset(scratch_data_0, -input_offset_difference, workspace_height_stride);
+      scratch_data_0 += workspace_height_stride;
+      input_block_data += input_height_stride;
+    }
+    if (trailing_height_padding) {
+      copy_block_height -= 1;
+    }
+
+    for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+      const uint8* input_data_0 = input_block_data;
+      const uint8* input_data_1 = input_block_data + input_depth;
+      const uint8* input_data_2 = input_block_data + 2 * input_depth;
+      const uint8* input_data_3 = input_block_data + 3 * input_depth;
+
+      // Traverse the width one point at a time, but the depth in (micro) blocks
+      // of size 8.
+      //
+      // The depth and width margins, which are filled with "zeros", may be
+      // larger than is strictly needed to calculate output. This is because the
+      // conv calculation is performed across complete micro blocks.
+      for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
+        // Figure out division of work (available input vs zero-ed).
+        int adjusted_residual_width =
+            j_width == (input_width_micro_repeats) ? residual_width : 4;
+
+        if (trailing_width_padding &&
+            j_width == (width_overall_micro_repeats - 1)) {
+          adjusted_residual_width -= 1;
+        }
+        int start_width = 0;
+        if (leading_width_padding && j_width == 0) {
+          start_width = 1;
+        }
+        if (start_width == 0) {
+          if (adjusted_residual_width == 4) {
+            // Load, then zero.
+            for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+              work_reg_a = vld1q_lane_s8x8(input_data_2, work_reg_a, 1);
+              work_reg_b = vld1q_lane_s8x8(input_data_3, work_reg_b, 1);
+              work_reg_b = vld1q_lane_s8x8(input_data_1, work_reg_b, 0);
+              input_data_1 += 8;
+              work_reg_a = vld1q_lane_s8x8(input_data_0, work_reg_a, 0);
+              input_data_0 += 8;
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+              vzipq_s8_in_place(&work_reg_a, &work_reg_b);
+              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+
+              vst1q_s8(scratch_data_0, work_reg_a);
+              scratch_data_0 += 16;
+              vst1q_s8(scratch_data_0, work_reg_b);
+
+              scratch_data_0 += depth_advance - 16;
+              input_data_2 += 8;
+              input_data_3 += 8;
+            }
+            scratch_data_0 += width_advance;
+            input_data_0 += input_depth_skip;
+            input_data_1 += input_depth_skip;
+            input_data_2 += input_depth_skip;
+            input_data_3 += input_depth_skip;
+          } else {
+            TFLITE_DCHECK_LT(adjusted_residual_width, 4);
+            for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+              work_reg_a = vdupq_n_u8(-input_offset);
+              work_reg_b = vdupq_n_u8(-input_offset);
+              if (adjusted_residual_width > 0) {
+                work_reg_a = vld1q_lane_s8x8(input_data_0, work_reg_a, 0);
+                if (adjusted_residual_width > 1) {
+                  work_reg_b = vld1q_lane_s8x8(input_data_0 + input_depth,
+                                               work_reg_b, 0);
+                  if (adjusted_residual_width == 3) {
+                    work_reg_a = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
+                                                 work_reg_a, 1);
+                  }
+                }
+              }
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+              vzipq_s8_in_place(&work_reg_a, &work_reg_b);
+              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+
+              vst1q_s8(scratch_data_0, work_reg_a);
+              vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+              scratch_data_0 += depth_advance;
+              input_data_0 += 8;
+              input_data_1 += 8;
+              input_data_2 += 8;
+              input_data_3 += 8;
+            }
+            scratch_data_0 += width_advance;
+            input_data_0 += input_depth_skip;
+            input_data_1 += input_depth_skip;
+            input_data_2 += input_depth_skip;
+            input_data_3 += input_depth_skip;
+          }
+        } else {
+          if (adjusted_residual_width == 4) {
+            // Load, then zero.
+            for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+              work_reg_a = vdupq_n_u8(-input_offset);
+              work_reg_a = vld1q_lane_s8x8(input_data_2, work_reg_a, 1);
+              work_reg_b = vld1q_lane_s8x8(input_data_3, work_reg_b, 1);
+              work_reg_b = vld1q_lane_s8x8(input_data_1, work_reg_b, 0);
+              input_data_1 += 8;
+              // Skip loading first column.
+              input_data_0 += 8;
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+              vzipq_s8_in_place(&work_reg_a, &work_reg_b);
+              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+
+              vst1q_s8(scratch_data_0, work_reg_a);
+              scratch_data_0 += 16;
+              vst1q_s8(scratch_data_0, work_reg_b);
+
+              scratch_data_0 += depth_advance - 16;
+              input_data_2 += 8;
+              input_data_3 += 8;
+            }
+            scratch_data_0 += width_advance;
+            input_data_0 += input_depth_skip;
+            input_data_1 += input_depth_skip;
+            input_data_2 += input_depth_skip;
+            input_data_3 += input_depth_skip;
+          } else {
+            TFLITE_DCHECK_LT(adjusted_residual_width, 4);
+            for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+              work_reg_a = vdupq_n_u8(-input_offset);
+              // Skip loading first column.
+              work_reg_b = vdupq_n_u8(-input_offset);
+              if (adjusted_residual_width > 1) {
+                work_reg_b =
+                    vld1q_lane_s8x8(input_data_0 + input_depth, work_reg_b, 0);
+                if (adjusted_residual_width == 3) {
+                  work_reg_a = vld1q_lane_s8x8(input_data_0 + 2 * input_depth,
+                                               work_reg_a, 1);
+                }
+              }
+              work_reg_a = veorq_s8(work_reg_a, sign_bit);
+              work_reg_b = veorq_s8(work_reg_b, sign_bit);
+
+              vzipq_s8_in_place(&work_reg_a, &work_reg_b);
+              vzipq_s8x2_in_place(&work_reg_a, &work_reg_b);
+
+              vst1q_s8(scratch_data_0, work_reg_a);
+              vst1q_s8(scratch_data_0 + 16, work_reg_b);
+
+              scratch_data_0 += depth_advance;
+              input_data_0 += 8;
+              input_data_1 += 8;
+              input_data_2 += 8;
+              input_data_3 += 8;
+            }
+            scratch_data_0 += width_advance;
+            input_data_0 += input_depth_skip;
+            input_data_1 += input_depth_skip;
+            input_data_2 += input_depth_skip;
+            input_data_3 += input_depth_skip;
+          }
+        }
+      }
+      scratch_data_0 += height_advance;
+      input_block_data += input_height_stride;
+    }
+
+    if (trailing_height_padding) {
+      memset(scratch_data_0, -input_offset_difference, workspace_height_stride);
+      scratch_data_0 += workspace_height_stride;
+    }
+
+    TFLITE_DCHECK_EQ(
+        scratch_data_0,
+        scratch_block_data + block_height * workspace_height_stride);
+  }
+
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    PreloadInputBlock(input_block_data, function_params);
+    PackMacroBlockNeon(height_block_number, width_block_number,
+                       input_block_data, scratch_block_data, function_params);
+  }
+};
+
+template <>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                      /*max_padding=*/1> {
+  static inline void PackMacroBlockNeon(
+      int32 height_block_number, int32 width_block_number,
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+
+    const int padding_left = function_params->padding_left;
+    const int padding_right = function_params->padding_right;
+    const int padding_top = function_params->padding_top;
+    const int padding_bottom = function_params->padding_bottom;
+
+    constexpr int kSymmetricZeroPoint = 128;
+
+    TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
+
+    const bool leading_width_padding =
+        padding_left > 0 && width_block_number == 0;
+    const bool trailing_width_padding =
+        padding_right > 0 &&
+        width_block_number == (function_params->width_macro_count - 1);
+    const bool leading_height_padding =
+        padding_top > 0 && height_block_number < 0;
+    const bool trailing_height_padding =
+        padding_bottom > 0 &&
+        height_block_number == (function_params->height_macro_count - 1);
+
+    const int32 input_offset = function_params->input_offset;
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data_base = scratch_block_data;
+
+    int copy_block_height = block_height;
+    if (leading_height_padding) {
+      copy_block_height -= 1;
+      memset(scratch_data_base, -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+      scratch_data_base += workspace_height_stride;
+      input_block_data += input_height_stride;
+    }
+    if (trailing_height_padding) {
+      copy_block_height -= 1;
+    }
+
+    int adjusted_residual_width =
+        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
+                                                                : 4;
+
+    if (trailing_width_padding) {
+      adjusted_residual_width -= 1;
+    }
+    int start_width = 0;
+    if (leading_width_padding) {
+      start_width = 1;
+      input_block_data += 1;
+    }
+
+    const int copy_size = (width_overall_micro_repeats - 1) * 4 +
+                          adjusted_residual_width - start_width;
+    // Adjusted so that later conditionals are simplified.
+    const int copy_size_adjusted =
+        trailing_width_padding ? copy_size + 1 : copy_size;
+
+    TFLITE_DCHECK_LE(
+        copy_size,
+        input_height_stride - width_block_number * input_width_micro_repeats);
+    // We may drop up to stride-1 of trailing input.
+    TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
+
+    int scratch_data_offset = 0;
+    int input_block_offset = 0;
+
+    constexpr uint8 kSignBit = 0x80;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
+    // code. Note the blocks of 4x4 are still interleaved down the depth.
+    int8x16_t work_reg;
+    int8x8_t half_work_reg;
+    int8x8_t padding_mask;
+
+    // Effect subtraction of zero-point = 128 by XOR of sign bit.
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
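+    // For example, an input byte of 200 (0xC8) maps to 0xC8 ^ 0x80 = 0x48 = 72
+    // = 200 - 128, i.e. the uint8 value re-centered about the int8 zero point.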
+    const uint8x16_t padding_reg = vdupq_n_u8(-input_offset);
+    padding_mask = vdup_n_s8(-1);
+    half_work_reg = vdup_n_s8(0);
+
+    if (copy_size >= 16) {
+      const int copy_remaining = (copy_size + start_width) & 0x7;
+      padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining));
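+      // After this shift the low copy_remaining byte lanes of padding_mask are
+      // zero (keep loaded data) and the upper lanes are all-ones (select the
+      // padding value) when the mask is applied with vbsl_s8 below.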
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // The surrounding condition ensures that we always need at least one
+        // iteration of the main copy loop. In the case of leading width
+        // padding, we unroll this specially.
+        if (leading_width_padding) {
+          work_reg = vld1q_u8(input_block_data + input_block_offset);
+          work_reg = vextq_s8(padding_reg, work_reg, 15);
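+          // vextq places one padding byte in lane 0 followed by the first 15
+          // loaded bytes, so this store consumes only 15 input bytes.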
+          work_reg = veorq_s8(work_reg, sign_bit);
+          vst1q_s8(scratch_data, work_reg);
+          copy_done += 15;
+        }
+
+        // Main copy loop.
+        for (; (copy_done + 16) <= copy_size; copy_done += 16) {
+          work_reg =
+              vld1q_u8(input_block_data + input_block_offset + copy_done);
+          work_reg = veorq_s8(work_reg, sign_bit);
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0);
+          vst1q_s8(scratch_data + start_width + copy_done, work_reg);
+        }
+
+        if (copy_done + 8 <= copy_size) {
+          half_work_reg =
+              vld1_u8(input_block_data + input_block_offset + copy_done);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
+          vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
+          copy_done += 8;
+        }
+
+        TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size_adjusted) {
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
+          // This has the advantage of resulting in zeros after shifting.
+          half_work_reg =
+              vld1_u8(input_block_data + input_block_offset + copy_size - 8);
+
+          half_work_reg =
+              vshl_u64(half_work_reg, vdup_n_s64(-8 * (8 - copy_remaining)));
+          half_work_reg =
+              vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
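+          // The negative vshl count shifts right, moving the copy_remaining
+          // valid bytes down to lanes [0, copy_remaining); vbsl then fills the
+          // upper lanes with the padding value.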
+
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
+          vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
+        }
+
+        // Trailing guard.
+        vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
+        vst1_s8(scratch_data + start_width + copy_done + 8, half_work_reg);
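+        // These guard stores intentionally write past the end of the copied
+        // row; the workspace is sized with kWorkspaceExtension slack so that
+        // such writes, and full-width reads of this row, stay in bounds.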
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else if (copy_size >= 4) {
+      const int copy_remaining = (copy_size + start_width) & 0x3;
+      padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining));
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // The surrounding condition ensures that we always need at least one
+        // iteration of the main copy loop. In the case of leading width
+        // padding, we unroll this specially.
+        if (leading_width_padding) {
+          half_work_reg = vld1_lane_8x4(input_block_data + input_block_offset,
+                                        half_work_reg, 0);
+          half_work_reg = vext_s8(vget_low_s8(padding_reg), half_work_reg, 7);
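+          // vext places one padding byte in lane 0 followed by the first three
+          // loaded bytes, so this store consumes only 3 input bytes.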
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          vst1_lane_8x4(scratch_data, half_work_reg, 0);
+          copy_done += 3;
+        }
+
+        // Main copy loop.
+        for (; (copy_done + 4) <= copy_size; copy_done += 4) {
+          // Important! Most compilation configurations will compile and run
+          // without the reinterpret_cast. Sanitizers may fail silently on
+              // lane-loading, with an obscure bug or mis-feature, probably in
+          // unhygienic macro expansion.
+          half_work_reg =
+              vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
+                            half_work_reg, 0);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
+          vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg,
+                        0);
+        }
+
+        TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size_adjusted) {
+          TFLITE_DCHECK_LT(copy_remaining, 4);
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
+          // This has the advantage of resulting in zeros after shifting.
+          half_work_reg = vld1_lane_8x4(
+              input_block_data + input_block_offset + copy_size - 4,
+              half_work_reg, 0);
+
+          half_work_reg =
+              vshl_u64(half_work_reg, vdup_n_s64(-8 * (4 - copy_remaining)));
+          half_work_reg =
+              vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
+          vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg,
+                        0);
+          copy_done += 4;
+        }
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + start_width + copy_done + 4, half_work_reg,
+                      0);
+        vst1_lane_8x4(scratch_data + start_width + copy_done + 8, half_work_reg,
+                      0);
+        vst1_lane_8x4(scratch_data + start_width + copy_done + 12,
+                      half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else if (width_overall_micro_repeats == 2) {
+      // Special case of 1 + 3 + 1, padding + copy + padding.
+      // This is rarely executed in practice.
+      TFLITE_DCHECK_EQ(copy_size, 3);
+      TFLITE_DCHECK_EQ(start_width, 1);
+      TFLITE_DCHECK(leading_width_padding);
+      TFLITE_DCHECK(trailing_width_padding);
+      // ASM should use MOVI 64-bit set.
+      padding_mask = vcreate_u64(~0xffffff00L);
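+      // Mask byte layout: lane 0 and lanes 4..7 select the padding value,
+      // lanes 1..3 keep the three loaded input bytes.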
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        half_work_reg = vld1_lane_s8(reinterpret_cast<const int8*>(
+                                         input_block_data + input_block_offset),
+                                     half_work_reg, 1);
+        half_work_reg =
+            vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
+                                                       input_block_offset + 1),
+                         half_work_reg, 2);
+        half_work_reg =
+            vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
+                                                       input_block_offset + 2),
+                         half_work_reg, 3);
+        half_work_reg =
+            vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+
+        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        TFLITE_DCHECK_EQ(scratch_data_offset % 8, 0);
+        vst1_s8(scratch_data_base + scratch_data_offset, half_work_reg);
+
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
+                      half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else {
+      TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
+      const int copy_remaining = (copy_size + start_width) & 0x3;
+      padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining));
+      if (leading_width_padding) {
+        padding_mask = vset_lane_u8(255, padding_mask, 0);
+      }
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
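+        // Build the short row in half_work_reg one byte at a time: bytes are
+        // loaded in reverse order into lane 0 while the register shifts up, so
+        // input byte i ends up in lane i.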
+        for (int i = 0; i < copy_size; ++i) {
+          half_work_reg = vshl_n_u64(half_work_reg, 8);
+          half_work_reg = vld1_lane_s8(
+              reinterpret_cast<const int8*>(
+                  input_block_data + input_block_offset + copy_size - 1 - i),
+              half_work_reg, 0);
+        }
+        if (leading_width_padding) {
+          half_work_reg = vshl_n_s64(half_work_reg, 8);
+        }
+        half_work_reg =
+            vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+
+        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset, half_work_reg,
+                      0);
+
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
+                      half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    }
+
+    scratch_data_base += copy_block_height * workspace_height_stride;
+
+    if (trailing_height_padding) {
+      memset(scratch_data_base, -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+      scratch_data_base += workspace_height_stride;
+    }
+
+    TFLITE_DCHECK_EQ(
+        scratch_data_base,
+        scratch_block_data + block_height * workspace_height_stride);
+  }
+
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    PreloadInputBlock(input_block_data, function_params);
+    PackMacroBlockNeon(height_block_number, width_block_number,
+                       input_block_data, scratch_block_data, function_params);
+  }
+};
+
+template <>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                      /*max_padding=*/0> {
+  static inline void PackMacroBlockNeon(
+      int32 height_block_number, int32 width_block_number,
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+
+    TFLITE_DCHECK_EQ(function_params->padding_left, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_right, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_top, 0);
+    TFLITE_DCHECK_EQ(function_params->padding_bottom, 0);
+
+    TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data_base = scratch_block_data;
+
+    const int copy_block_height = block_height;
+
+    int adjusted_residual_width =
+        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
+                                                                : 4;
+
+    const int copy_size =
+        (width_overall_micro_repeats - 1) * 4 + adjusted_residual_width;
+
+    TFLITE_DCHECK_LE(
+        copy_size,
+        input_height_stride - width_block_number * input_width_micro_repeats);
+    // We may drop up to stride-1 of trailing input.
+    TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
+
+    int scratch_data_offset = 0;
+    int input_block_offset = 0;
+
+    constexpr uint8 kSignBit = 0x80;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
+    // code. Note the blocks of 4x4 are still interleaved down the depth.
+    int8x16_t work_reg;
+    int8x8_t half_work_reg;
+
+    // Effect subtraction of zero-point = 128 by XOR of sign bit.
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
+    half_work_reg = vdup_n_s8(0);
+
+    if (copy_size >= 16) {
+      const int copy_remaining = copy_size & 0x7;
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // Main copy loop.
+        for (; (copy_done + 16) <= copy_size; copy_done += 16) {
+          work_reg =
+              vld1q_u8(input_block_data + input_block_offset + copy_done);
+          work_reg = veorq_s8(work_reg, sign_bit);
+          TFLITE_DCHECK_EQ(copy_done % 16, 0);
+          vst1q_s8(scratch_data + copy_done, work_reg);
+        }
+
+        if (copy_done + 8 <= copy_size) {
+          half_work_reg =
+              vld1_u8(input_block_data + input_block_offset + copy_done);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ(copy_done % 8, 0);
+          vst1_s8(scratch_data + copy_done, half_work_reg);
+          copy_done += 8;
+        }
+
+        TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size) {
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
+          // This has the advantage of resulting in zeros after shifting.
+          half_work_reg =
+              vld1_u8(input_block_data + input_block_offset + copy_size - 8);
+
+          half_work_reg =
+              vshl_u64(half_work_reg, vdup_n_s64(-8 * (8 - copy_remaining)));
+
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ(copy_done % 8, 0);
+          vst1_s8(scratch_data + copy_done, half_work_reg);
+          copy_done += 8;
+        }
+
+        // Trailing guard.
+        vst1_s8(scratch_data + copy_done, half_work_reg);
+        vst1_s8(scratch_data + copy_done + 8, half_work_reg);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else if (copy_size >= 4) {
+      const int copy_remaining = copy_size & 0x3;
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // Main copy loop.
+        for (; (copy_done + 4) <= copy_size; copy_done += 4) {
+          // Important! Most compilation configurations will compile and run
+          // without the reinterpret_cast. Sanitizers may fail silently on
+              // lane-loading, with an obscure bug or mis-feature, probably in
+          // unhygienic macro expansion.
+          half_work_reg =
+              vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
+                            half_work_reg, 0);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ(copy_done % 4, 0);
+          vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
+        }
+
+        TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size) {
+          TFLITE_DCHECK_LT(copy_remaining, 4);
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
+          // This has the advantage of resulting in zeros after shifting.
+          half_work_reg = vld1_lane_8x4(
+              input_block_data + input_block_offset + copy_size - 4,
+              half_work_reg, 0);
+
+          half_work_reg =
+              vshl_u64(half_work_reg, vdup_n_s64(-8 * (4 - copy_remaining)));
+
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ(copy_done % 4, 0);
+          vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
+          copy_done += 4;
+        }
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + copy_done + 4, half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + copy_done + 8, half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + copy_done + 12, half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else {
+      TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        for (int i = 0; i < copy_size; ++i) {
+          half_work_reg = vshl_n_u64(half_work_reg, 8);
+          half_work_reg = vld1_lane_s8(
+              reinterpret_cast<const int8*>(
+                  input_block_data + input_block_offset + copy_size - 1 - i),
+              half_work_reg, 0);
+        }
+
+        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset, half_work_reg,
+                      0);
+
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
+                      half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    }
+
+    scratch_data_base += copy_block_height * workspace_height_stride;
+
+    TFLITE_DCHECK_EQ(
+        scratch_data_base,
+        scratch_block_data + block_height * workspace_height_stride);
+  }
+
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    PreloadInputBlock(input_block_data, function_params);
+    PackMacroBlockNeon(height_block_number, width_block_number,
+                       input_block_data, scratch_block_data, function_params);
+  }
+};
+
+template <>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kNoMultiplication,
+                        /*stride=*/1> {
+  static inline void KernelMacroBlockNeon(
+      const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int input_width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int depth = function_params->input_depth;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+    const int width_micro_stride = 4 * 8;
+    const int depth_micro_stride =
+        width_micro_stride * input_width_overall_micro_repeats;
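+    // As packed above, each width micro block holds 4 columns x 8 bytes of
+    // depth (width_micro_stride = 32); within a workspace row, the width micro
+    // blocks of one depth micro block precede those of the next, giving
+    // depth_micro_stride.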
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+    TFLITE_DCHECK_GE(output_activation_min, 0);
+    TFLITE_DCHECK_LT(output_activation_min, 256);
+    TFLITE_DCHECK_GE(output_activation_max, 0);
+    TFLITE_DCHECK_LT(output_activation_max, 256);
+    TFLITE_DCHECK_GE(output_offset, -32768);
+    TFLITE_DCHECK_LT(output_offset, 32768);
+
+    const int16x8_t output_offset_vec =
+        vdupq_n_s16(static_cast<int16>(output_offset));
+    const uint8x16_t output_activation_min_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_min));
+    const uint8x16_t output_activation_max_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_max));
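+    // Scalar sketch of the per-accumulator requantization performed below
+    // (helper names are illustrative; the rounding follows vqrdmulhq_n_s32 and
+    // DivideByPOT<kUpward>):
+    //   acc = RoundingDivideByPOT(
+    //       SaturatingRoundingDoublingHighMul(acc, output_multiplier),
+    //       -output_shift);
+    //   out = Clamp(acc + output_offset,
+    //               output_activation_min, output_activation_max);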
+
+    const int8* input_data_depthwise = scratch_block_data;
+    uint8* output_data_depthwise = output_block_data;
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      // Simulate NEON-register transposition of subset of filter.
+      int8x16_t filter_reg_0_a;
+      int8x16_t filter_reg_0_b;
+      int8x16_t filter_reg_1_a;
+      int8x16_t filter_reg_1_b;
+      int8x16_t filter_reg_2_a;
+      int8x16_t filter_reg_2_b;
+      int8x16_t filter_reg_0_a_shifted;
+      int8x16_t filter_reg_1_a_shifted;
+      int8x16_t filter_reg_2_a_shifted;
+
+      filter_reg_0_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_0_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+
+      filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
+      filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
+      filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
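+      // The *_shifted copies hold each 32-bit filter lane shifted up by one
+      // byte; the accumulation blocks below alternate plain and shifted
+      // filters to advance the window one column at a time without repacking
+      // the input.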
+
+      if (block_height == 4) {
+        for (int s = 0; s < 2; ++s) {
+          // Work through one slice, by row, at a time.
+          const int8* input_data_base = input_data_depthwise + 2 * 8 * s;
+          uint8* output_data_base = output_data_depthwise + 4 * s;
+
+          const int8* next_input_data = input_data_base;
+          uint8* output_data = output_data_base;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+          bias_data += bias_increment;
+
+          // Load first sub-micro block of data into operational banks.
+          int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
+          int8x16_t left_bank_1_reg =
+              vld1q_s8(next_input_data + workspace_height_stride);
+          int8x16_t left_bank_2_reg =
+              vld1q_s8(next_input_data + 2 * workspace_height_stride);
+          int8x16_t left_bank_3_reg =
+              vld1q_s8(next_input_data + 3 * workspace_height_stride);
+          int8x16_t left_bank_4_reg =
+              vld1q_s8(next_input_data + 4 * workspace_height_stride);
+          int8x16_t left_bank_5_reg =
+              vld1q_s8(next_input_data + 5 * workspace_height_stride);
+
+          int32x4_t acc0;
+          int32x4_t acc1;
+          int32x4_t acc2;
+          int32x4_t acc3;
+
+          acc0 = adjusted_bias_data;
+          acc1 = adjusted_bias_data;
+          acc2 = adjusted_bias_data;
+          acc3 = adjusted_bias_data;
+
+          acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+          acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
+          acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
+          acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
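+          // The accumulators are software-pipelined: part of each output's dot
+          // products is issued here (and again at the bottom of the loop for
+          // later iterations), with the remainder completed just before
+          // requantization.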
+
+          for (int i_width = 0; i_width < output_width_micro_repeats;
+               ++i_width) {
+            next_input_data += width_micro_stride;
+
+            // Iterate over input width shifts within 4x4 blocks.
+            {
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              output_data += depth;
+            }
+
+            // Load next sub-micro block of data.
+            int8x16_t right_bank_0_reg;
+            int8x16_t right_bank_1_reg;
+            int8x16_t right_bank_2_reg;
+            int8x16_t right_bank_3_reg;
+            int8x16_t right_bank_4_reg;
+            int8x16_t right_bank_5_reg;
+            // Logic: (i_width == output_width_micro_repeats) &&
+            //        ((residual_width - 1) * stride_val < 2)
+            const bool no_right_block =
+                i_width == output_width_micro_repeats && residual_width < 3;
+
+            if (no_right_block) {
+              // Only needed for sanitizer checks.
+              right_bank_0_reg = vdupq_n_s8(0);
+              right_bank_1_reg = vdupq_n_s8(0);
+              right_bank_2_reg = vdupq_n_s8(0);
+              right_bank_3_reg = vdupq_n_s8(0);
+              right_bank_4_reg = vdupq_n_s8(0);
+              right_bank_5_reg = vdupq_n_s8(0);
+            } else {
+              right_bank_0_reg = vld1q_s8(next_input_data);
+              right_bank_1_reg =
+                  vld1q_s8(next_input_data + workspace_height_stride);
+              right_bank_2_reg =
+                  vld1q_s8(next_input_data + 2 * workspace_height_stride);
+              right_bank_3_reg =
+                  vld1q_s8(next_input_data + 3 * workspace_height_stride);
+              right_bank_4_reg =
+                  vld1q_s8(next_input_data + 4 * workspace_height_stride);
+              right_bank_5_reg =
+                  vld1q_s8(next_input_data + 5 * workspace_height_stride);
+            }
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
+              left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
+              left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
+              left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
+              left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
+              left_bank_5_reg = vrev32q_u16(left_bank_5_reg);
+              vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
+              vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
+              vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
+              vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
+              vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
+              vtrn1_s8x2_in_place(&left_bank_5_reg, &right_bank_5_reg);
+
+              output_data += depth;
+            }
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              output_data += depth;
+            }
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              left_bank_0_reg = right_bank_0_reg;
+              left_bank_1_reg = right_bank_1_reg;
+              left_bank_2_reg = right_bank_2_reg;
+              left_bank_3_reg = right_bank_3_reg;
+              left_bank_4_reg = right_bank_4_reg;
+              left_bank_5_reg = right_bank_5_reg;
+
+              output_data += depth;
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
+            }
+          }
+
+          if (residual_width > 0) {
+            next_input_data += width_micro_stride;
+            const int output_width = residual_width;
+
+            // Load next sub-micro block of data.
+            int8x16_t right_bank_0_reg;
+            int8x16_t right_bank_1_reg;
+            int8x16_t right_bank_2_reg;
+            int8x16_t right_bank_3_reg;
+            int8x16_t right_bank_4_reg;
+            int8x16_t right_bank_5_reg;
+            // Logic: (output_width - 1) * stride_val < 2.
+            const bool no_right_block = output_width < 3;
+
+            if (no_right_block) {
+              // Only needed for sanitizer checks.
+              right_bank_0_reg = vdupq_n_s8(0);
+              right_bank_1_reg = vdupq_n_s8(0);
+              right_bank_2_reg = vdupq_n_s8(0);
+              right_bank_3_reg = vdupq_n_s8(0);
+              right_bank_4_reg = vdupq_n_s8(0);
+              right_bank_5_reg = vdupq_n_s8(0);
+            } else {
+              right_bank_0_reg = vld1q_s8(next_input_data);
+              right_bank_1_reg =
+                  vld1q_s8(next_input_data + workspace_height_stride);
+              right_bank_2_reg =
+                  vld1q_s8(next_input_data + 2 * workspace_height_stride);
+              right_bank_3_reg =
+                  vld1q_s8(next_input_data + 3 * workspace_height_stride);
+              right_bank_4_reg =
+                  vld1q_s8(next_input_data + 4 * workspace_height_stride);
+              right_bank_5_reg =
+                  vld1q_s8(next_input_data + 5 * workspace_height_stride);
+            }
+
+            // Iterate over input width shifts within 4x4 blocks.
+            for (int x = 0; x < output_width; ++x) {
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
+              biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
+              biregister_rotate_8(&left_bank_2_reg, &right_bank_2_reg);
+              biregister_rotate_8(&left_bank_3_reg, &right_bank_3_reg);
+              biregister_rotate_8(&left_bank_4_reg, &right_bank_4_reg);
+              biregister_rotate_8(&left_bank_5_reg, &right_bank_5_reg);
+
+              output_data += depth;
+
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
+              acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
+              acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
+            }
+          }
+          input_data_base += 4 * workspace_height_stride;
+          output_data_base += 4 * output_height_stride;
+
+          // Move to next sub-block: advance to second set of filters, to new
+          // bias.
+          filter_reg_0_a = filter_reg_0_b;
+          filter_reg_1_a = filter_reg_1_b;
+          filter_reg_2_a = filter_reg_2_b;
+          filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
+          filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
+          filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
+        }
+      } else {
+        for (int s = 0; s < 2; ++s) {
+          // Work through one slice, by row, at a time.
+          const int8* input_data_base = input_data_depthwise + 2 * 8 * s;
+          uint8* output_data_base = output_data_depthwise + 4 * s;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+          bias_data += bias_increment;
+
+          for (int k_height = 0; k_height < block_height; ++k_height) {
+            const int8* next_input_data = input_data_base;
+            uint8* output_data = output_data_base;
+
+            // Load first sub-micro block of data into operational banks.
+            int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
+            int8x16_t left_bank_1_reg =
+                vld1q_s8(next_input_data + workspace_height_stride);
+            int8x16_t left_bank_2_reg =
+                vld1q_s8(next_input_data + 2 * workspace_height_stride);
+
+            for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+                 ++i_width) {
+              next_input_data += width_micro_stride;
+              const int output_width =
+                  i_width == output_width_micro_repeats ? residual_width : 4;
+
+              // Load next sub-micro block of data.
+              int8x16_t right_bank_0_reg;
+              int8x16_t right_bank_1_reg;
+              int8x16_t right_bank_2_reg;
+              // Logic: (output_width - 1) * stride_val < 2.
+              const bool no_right_block = output_width < 3;
+
+              if (no_right_block) {
+                // Only needed for sanitizer checks.
+                right_bank_0_reg = vdupq_n_s8(0);
+                right_bank_1_reg = vdupq_n_s8(0);
+                right_bank_2_reg = vdupq_n_s8(0);
+              } else {
+                right_bank_0_reg = vld1q_s8(next_input_data);
+                right_bank_1_reg =
+                    vld1q_s8(next_input_data + workspace_height_stride);
+                right_bank_2_reg =
+                    vld1q_s8(next_input_data + 2 * workspace_height_stride);
+              }
+              // Load next sub-micro block of data.
+
+              // Iterate over input width shifts within 4x4 blocks.
+              for (int x = 0; x < output_width; ++x) {
+                int32x4_t acc = adjusted_bias_data;
+                acc = vdotq_s32(acc, filter_reg_0_a, left_bank_0_reg);
+                acc = vdotq_s32(acc, filter_reg_1_a, left_bank_1_reg);
+                acc = vdotq_s32(acc, filter_reg_2_a, left_bank_2_reg);
+
+                // Fixed-point multiplication.
+                acc = vqrdmulhq_n_s32(acc, output_multiplier);
+                acc = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                    acc, -output_shift);
+                // Add the output offset.
+                // Note that we need to fill the top half with vcombine, but can
+                // drop the instruction in ASM code.
+                int16x8_t acc_s16_0_0 =
+                    vcombine_s16(vqmovn_s32(acc), vqmovn_s32(acc));
+                acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
+                // Apply the activation function.
+                uint8x8_t acc_u8_0_0 = vqmovun_s16(acc_s16_0_0);
+                acc_u8_0_0 =
+                    vmax_u8(acc_u8_0_0, vget_low_u8(output_activation_min_vec));
+                acc_u8_0_0 =
+                    vmin_u8(acc_u8_0_0, vget_low_u8(output_activation_max_vec));
+
+                vst1_lane_8x4(output_data, acc_u8_0_0, 0);
+
+                biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
+                biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
+                biregister_rotate_8(&left_bank_2_reg, &right_bank_2_reg);
+
+                output_data += depth;
+              }
+            }
+            input_data_base += workspace_height_stride;
+            output_data_base += output_height_stride;
+          }
+
+          // Move to next sub-block: advance to second set of filters.
+          filter_reg_0_a = filter_reg_0_b;
+          filter_reg_1_a = filter_reg_1_b;
+          filter_reg_2_a = filter_reg_2_b;
+        }
+      }
+      input_data_depthwise += depth_micro_stride;
+      output_data_depthwise += 8;
+    }
+  }  // NOLINT(readability/fn_size) Manually unrolled.
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
+                         output_block_data, function_params);
+  }
+};
+
+template <>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kNoMultiplication,
+                        /*stride=*/2> {
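+  // Stride-2 variant without depth multiplication: each depth micro block
+  // covers 8 channels, handled as two 4-channel sub-blocks (the 's' loop),
+  // with at most two output rows per macro block (block_height <= 2).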
+  static inline void KernelMacroBlockNeon(
+      const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int input_width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int depth = function_params->input_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int workspace_width_micro_repeats =
+        function_params->workspace_width_micro_repeats;
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+    const int width_micro_stride = 4 * 8;
+    const int depth_micro_stride =
+        width_micro_stride * input_width_overall_micro_repeats;
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+    TFLITE_DCHECK_GE(output_activation_min, 0);
+    TFLITE_DCHECK_LT(output_activation_min, 256);
+    TFLITE_DCHECK_GE(output_activation_max, 0);
+    TFLITE_DCHECK_LT(output_activation_max, 256);
+    TFLITE_DCHECK_GE(output_offset, -32768);
+    TFLITE_DCHECK_LT(output_offset, 32768);
+
+    // This version only does min/max on 64 bits.
+    const int16x8_t output_offset_vec =
+        vdupq_n_s16(static_cast<int16>(output_offset));
+    const uint8x8_t output_activation_min_vec =
+        vdup_n_u8(static_cast<uint8>(output_activation_min));
+    const uint8x8_t output_activation_max_vec =
+        vdup_n_u8(static_cast<uint8>(output_activation_max));
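+    // Requantization below follows the usual TFLite fixed-point path:
+    // saturating rounding doubling high multiply by output_multiplier
+    // (vqrdmulhq_n_s32), a rounding right shift of -output_shift bits,
+    // addition of the output offset, then saturation and clamping to the
+    // uint8 activation range.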
+
+    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
+
+    TFLITE_DCHECK_EQ(stride_val, 2);
+    TFLITE_DCHECK_LE(block_height, 2);
+
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      const int8* filter_block =
+          filter_workspace + shuffled_filter_increment * j_depth;
+
+      if (block_height == 2) {
+        for (int s = 0; s < 2; ++s) {
+          // Simulate NEON-register transposition of subset of filter.
+          int8x16_t filter_reg_0_a;
+          int8x16_t filter_reg_1_a;
+          int8x16_t filter_reg_2_a;
+
+          filter_reg_0_a = vld1q_s8(filter_block + s * 16);
+          filter_reg_1_a = vld1q_s8(filter_block + s * 16 + 32);
+          filter_reg_2_a = vld1q_s8(filter_block + s * 16 + 64);
+
+          const int8* scratch_data =
+              scratch_block_data + depth_micro_stride * j_depth;
+          uint8* output_data = output_block_data + 8 * j_depth;
+          const int8* input_data_0 = scratch_data + s * 2 * 8;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+
+          // Load first sub-micro block of data into operational banks.
+          int8x16_t left_bank_0_reg = vld1q_s8(input_data_0);
+          int8x16_t left_bank_1_reg =
+              vld1q_s8(input_data_0 + workspace_height_stride);
+          int8x16_t left_bank_2_reg =
+              vld1q_s8(input_data_0 + 2 * workspace_height_stride);
+          int8x16_t left_bank_3_reg =
+              vld1q_s8(input_data_0 + 3 * workspace_height_stride);
+          int8x16_t left_bank_4_reg =
+              vld1q_s8(input_data_0 + 4 * workspace_height_stride);
+
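+          // The left banks hold the current 4x4 input sub-block; the right
+          // banks hold the next one, from which bytes are pulled when
+          // computing the second stride-2 output column.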
+          int8x16_t right_bank_0_reg;
+          int8x16_t right_bank_1_reg;
+          int8x16_t right_bank_2_reg;
+          int8x16_t right_bank_3_reg;
+          int8x16_t right_bank_4_reg;
+
+          int32x4_t acc0;
+          int32x4_t acc1;
+
+          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+               ++i_width) {
+            const int output_width = i_width == output_width_micro_repeats
+                                         ? residual_width
+                                         : four_over_stride;
+            TFLITE_DCHECK_LE(output_width * stride_val, 4);
+            const int8* input_data =
+                input_data_0 + width_micro_stride * i_width;
+            const bool no_right_block = i_width == output_width_micro_repeats &&
+                                        output_width_overall_micro_repeats ==
+                                            workspace_width_micro_repeats;
+
+            if (!no_right_block) {
+              // Load next sub-micro block of data.
+              right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
+              right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
+                                          workspace_height_stride);
+              right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
+                                          2 * workspace_height_stride);
+              right_bank_3_reg = vld1q_s8(input_data + width_micro_stride +
+                                          3 * workspace_height_stride);
+              right_bank_4_reg = vld1q_s8(input_data + width_micro_stride +
+                                          4 * workspace_height_stride);
+            }
+
+            uint8* output_data_base = output_data + depth * 2 * i_width + 4 * s;
+
+            // Iterate over input width shifts within 4x4 blocks.
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              // Apply the activation function.
+              uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
+              acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
+              acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
+
+              vst1_lane_8x4(output_data_base, acc_u8, 0);
+              vst1_lane_8x4(output_data_base + output_height_stride, acc_u8, 1);
+
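+              // Rebuild the left banks for the second stride-2 output column
+              // of this micro block: the vrev32q_u16 / vtrn1_s8x2_in_place
+              // pair pulls in the needed bytes from the right banks so the
+              // same filter registers can be reused.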
+              left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
+              left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
+              left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
+              left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
+              left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
+              vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
+              vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
+              vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
+              vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
+              vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
+            }
+
+            if (output_width > 1) {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_2_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_3_reg);
+              acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_4_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              // Apply the activation function.
+              uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
+              acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
+              acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
+
+              vst1_lane_8x4(output_data_base + depth, acc_u8, 0);
+              vst1_lane_8x4(output_data_base + depth + output_height_stride,
+                            acc_u8, 1);
+
+              left_bank_0_reg = right_bank_0_reg;
+              left_bank_1_reg = right_bank_1_reg;
+              left_bank_2_reg = right_bank_2_reg;
+              left_bank_3_reg = right_bank_3_reg;
+              left_bank_4_reg = right_bank_4_reg;
+            }
+          }
+          bias_data += bias_increment;
+        }
+      } else {
+        for (int s = 0; s < 2; ++s) {
+          // Simulate NEON-register transposition of subset of filter.
+          int8x16_t filter_reg_0_a;
+          int8x16_t filter_reg_1_a;
+          int8x16_t filter_reg_2_a;
+
+          filter_reg_0_a = vld1q_s8(filter_block + s * 16);
+          filter_reg_1_a = vld1q_s8(filter_block + s * 16 + 32);
+          filter_reg_2_a = vld1q_s8(filter_block + s * 16 + 64);
+
+          const int8* scratch_data =
+              scratch_block_data + depth_micro_stride * j_depth;
+          uint8* output_data = output_block_data + 8 * j_depth;
+          const int8* input_data_0 = scratch_data + s * 2 * 8;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+
+          // Load first sub-micro block of data into operational banks.
+          int8x16_t left_bank_0_reg = vld1q_s8(input_data_0);
+          int8x16_t left_bank_1_reg =
+              vld1q_s8(input_data_0 + workspace_height_stride);
+          int8x16_t left_bank_2_reg =
+              vld1q_s8(input_data_0 + 2 * workspace_height_stride);
+
+          int8x16_t right_bank_0_reg;
+          int8x16_t right_bank_1_reg;
+          int8x16_t right_bank_2_reg;
+
+          int32x4_t acc0;
+
+          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+               ++i_width) {
+            const int output_width = i_width == output_width_micro_repeats
+                                         ? residual_width
+                                         : four_over_stride;
+            TFLITE_DCHECK_LE(output_width * stride_val, 4);
+            const int8* input_data =
+                input_data_0 + width_micro_stride * i_width;
+            const bool no_right_block = i_width == output_width_micro_repeats &&
+                                        output_width_overall_micro_repeats ==
+                                            workspace_width_micro_repeats;
+
+            if (!no_right_block) {
+              // Load next sub-micro block of data.
+              right_bank_0_reg = vld1q_s8(input_data + width_micro_stride);
+              right_bank_1_reg = vld1q_s8(input_data + width_micro_stride +
+                                          workspace_height_stride);
+              right_bank_2_reg = vld1q_s8(input_data + width_micro_stride +
+                                          2 * workspace_height_stride);
+            }
+
+            uint8* output_data_base = output_data + depth * 2 * i_width + 4 * s;
+
+            // Iterate over input width shifts within 4x4 blocks.
+            {
+              acc0 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc0));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              // Apply the activation function.
+              uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
+              acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
+              acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
+
+              vst1_lane_8x4(output_data_base, acc_u8, 0);
+
+              left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
+              left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
+              left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
+              vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
+              vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
+              vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
+            }
+
+            if (output_width > 1) {
+              acc0 = adjusted_bias_data;
+
+              acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
+              acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc0));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              // Apply the activation function.
+              uint8x8_t acc_u8 = vqmovun_s16(acc_s16_0_1);
+              acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
+              acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
+
+              vst1_lane_8x4(output_data_base + depth, acc_u8, 0);
+
+              left_bank_0_reg = right_bank_0_reg;
+              left_bank_1_reg = right_bank_1_reg;
+              left_bank_2_reg = right_bank_2_reg;
+            }
+          }
+          bias_data += bias_increment;
+        }
+      }
+    }
+  }  // NOLINT(readability/fn_size) Manually unrolled.
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
+                         output_block_data, function_params);
+  }
+};
+
+template <>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                        /*stride=*/1> {
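+  // Unit-input-depth variant with stride 1: the workspace holds a single
+  // input channel, so each register bank packs several rows and the lane
+  // argument of vdotq_four_lane_s32 selects which row segment is used.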
+  static inline void KernelMacroBlockNeon(
+      const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    TFLITE_DCHECK_EQ(function_params->stride, 1);
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int output_depth = function_params->output_depth;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+
+    TFLITE_DCHECK_EQ(bias_increment, 4);
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+    TFLITE_DCHECK_GE(output_activation_min, 0);
+    TFLITE_DCHECK_LT(output_activation_min, 256);
+    TFLITE_DCHECK_GE(output_activation_max, 0);
+    TFLITE_DCHECK_LT(output_activation_max, 256);
+    TFLITE_DCHECK_GE(output_offset, -32768);
+    TFLITE_DCHECK_LT(output_offset, 32768);
+
+    const int16x8_t output_offset_vec =
+        vdupq_n_s16(static_cast<int16>(output_offset));
+    const uint8x16_t output_activation_min_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_min));
+    const uint8x16_t output_activation_max_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_max));
+
+    uint8* output_data_depthwise = output_block_data;
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      // Simulate NEON-register transposition of subset of filter.
+      int8x16_t filter_reg_0_a;
+      int8x16_t filter_reg_0_b;
+      int8x16_t filter_reg_1_a;
+      int8x16_t filter_reg_1_b;
+      int8x16_t filter_reg_2_a;
+      int8x16_t filter_reg_2_b;
+      int8x16_t filter_reg_0_a_shifted;
+      int8x16_t filter_reg_1_a_shifted;
+      int8x16_t filter_reg_2_a_shifted;
+
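+      // Each depth micro block consumes six 16-byte registers from the
+      // shuffled filter workspace: three filter rows for each of the two
+      // 4-channel sub-blocks ("a" and "b").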
+      filter_reg_0_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_0_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+
+      filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
+      filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
+      filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
+
+      if (block_height == 4) {
+        for (int s = 0; s < 2; ++s) {
+          // Work through one slice, by row, at a time.
+          uint8* output_data_base = output_data_depthwise + 4 * s;
+
+          const int8* next_input_data = scratch_block_data;
+          uint8* output_data = output_data_base;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+          bias_data += bias_increment;
+
+          int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
+          int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
+          int8x16_t input_bank_c_reg;  //  left 4, right 4, left 5, right 5.
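+          // Each bank packs four 4-byte row segments; the lane index passed
+          // to vdotq_four_lane_s32 selects which segment feeds the dot
+          // product.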
+
+          // Load first sub-micro block of data into operational banks.
+          input_bank_a_reg =
+              vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
+                                                // uninitialized variable.
+          input_bank_a_reg = vld1q_lane_8x4(
+              next_input_data + workspace_height_stride, input_bank_a_reg, 2);
+          input_bank_b_reg = vld1q_dup_s8x4(
+              next_input_data +
+              2 * workspace_height_stride);  // Load lane 0, avoiding
+                                             // uninitialized variable.
+          input_bank_b_reg =
+              vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
+                             input_bank_b_reg, 2);
+          input_bank_c_reg = vld1q_dup_s8x4(
+              next_input_data +
+              4 * workspace_height_stride);  // Load lane 0, avoiding
+                                             // uninitialized variable.
+          input_bank_c_reg =
+              vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
+                             input_bank_c_reg, 2);
+
+          int32x4_t acc0;
+          int32x4_t acc1;
+          int32x4_t acc2;
+          int32x4_t acc3;
+
+          acc0 = adjusted_bias_data;
+          acc1 = adjusted_bias_data;
+          acc2 = adjusted_bias_data;
+          acc3 = adjusted_bias_data;
+
+          acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+          acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 0);
+          acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg, 0);
+          acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg, 2);
+
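+          // acc0..acc3 (four output rows) are pre-seeded with bias and
+          // partially accumulated before entering the loop; each iteration
+          // completes the pending columns and starts the accumulation for
+          // the next one, interleaving loads with dot products.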
+          for (int i_width = 0; i_width < output_width_micro_repeats;
+               ++i_width) {
+            next_input_data += 4;
+
+            // Iterate over input width shifts within 4x4 blocks.
+            {
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
+                                         0);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
+                                         2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
+                                         2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
+                                         2);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              output_data += output_depth;
+            }
+            // Load next sub-micro block of data.
+            input_bank_a_reg =
+                vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
+            input_bank_a_reg = vld1q_lane_8x4(
+                next_input_data + workspace_height_stride, input_bank_a_reg, 3);
+            input_bank_b_reg =
+                vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
+                               input_bank_b_reg, 1);
+            input_bank_b_reg =
+                vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
+                               input_bank_b_reg, 3);
+            input_bank_c_reg =
+                vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
+                               input_bank_c_reg, 1);
+            input_bank_c_reg =
+                vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
+                               input_bank_c_reg, 3);
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
+                                         input_bank_a_reg, 0);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
+                                         input_bank_a_reg, 2);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
+                                         input_bank_a_reg, 2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
+                                         input_bank_c_reg, 0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
+                                         input_bank_c_reg, 0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
+                                         input_bank_c_reg, 2);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
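+              // Two output columns have been produced (one with the base
+              // filters, one with the byte-shifted copies), so advance each
+              // input bank by two bytes.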
+              input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+              input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+              input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
+
+              output_data += output_depth;
+            }
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
+                                         0);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
+                                         2);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
+                                         0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
+                                         2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
+                                         0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
+                                         0);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
+                                         2);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
+                                         2);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              output_data += output_depth;
+            }
+
+            {
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
+                                         input_bank_a_reg, 0);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
+                                         input_bank_a_reg, 2);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
+                                         input_bank_a_reg, 2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
+                                         input_bank_b_reg, 0);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
+                                         input_bank_c_reg, 0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
+                                         input_bank_b_reg, 2);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
+                                         input_bank_c_reg, 0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
+                                         input_bank_c_reg, 2);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+              input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+              input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
+
+              output_data += output_depth;
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
+                                         0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
+                                         0);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
+                                         2);
+            }
+          }
+
+          if (residual_width > 0) {
+            next_input_data += 4;
+            const int output_width = residual_width;
+
+            // Load next sub-micro block of data.
+            input_bank_a_reg =
+                vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
+            input_bank_a_reg = vld1q_lane_8x4(
+                next_input_data + workspace_height_stride, input_bank_a_reg, 3);
+            input_bank_b_reg =
+                vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
+                               input_bank_b_reg, 1);
+            input_bank_b_reg =
+                vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
+                               input_bank_b_reg, 3);
+            input_bank_c_reg =
+                vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
+                               input_bank_c_reg, 1);
+            input_bank_c_reg =
+                vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
+                               input_bank_c_reg, 3);
+
+            // Iterate over input width shifts within 4x4 blocks.
+            for (int x = 0; x < output_width; ++x) {
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
+                                         0);
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
+                                         2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
+                                         2);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
+                                         2);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
+                                         2);
+
+              // Fixed-point multiplication.
+              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc0, -output_shift);
+              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc1, -output_shift);
+              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
+              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc2, -output_shift);
+              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
+              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                  acc3, -output_shift);
+              // Add the output offset.
+              int16x8_t acc_s16_0_1 =
+                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+              int16x8_t acc_s16_2_3 =
+                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
+              // Apply the activation function.
+              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
+                                                  vqmovun_s16(acc_s16_2_3));
+              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
+              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
+
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
+
+              input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 8);
+              input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 8);
+              input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 8);
+
+              output_data += output_depth;
+
+              acc0 = adjusted_bias_data;
+              acc1 = adjusted_bias_data;
+              acc2 = adjusted_bias_data;
+              acc3 = adjusted_bias_data;
+
+              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
+                                         0);
+              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
+                                         0);
+              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
+                                         0);
+              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
+                                         2);
+            }
+          }
+          // scratch_block_data += 4 * workspace_height_stride;
+          output_data_base += 4 * output_height_stride;
+
+          // Move to next sub-block: advance to second set of filters, to new
+          // bias.
+          filter_reg_0_a = filter_reg_0_b;
+          filter_reg_1_a = filter_reg_1_b;
+          filter_reg_2_a = filter_reg_2_b;
+          filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
+          filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
+          filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
+        }
+      } else {
+        // Block height < 4.
+        for (int s = 0; s < 2; ++s) {
+          // Work through one slice, by row, at a time.
+          uint8* output_data_base = output_data_depthwise + 4 * s;
+
+          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
+          TFLITE_DCHECK_EQ(bias_increment, 4);
+          bias_data += bias_increment;
+
+          for (int k_height = 0; k_height < block_height; ++k_height) {
+            const int8* next_input_data =
+                scratch_block_data + k_height * workspace_height_stride;
+            uint8* output_data = output_data_base;
+
+            int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
+            int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
+
+            // Load first sub-micro block of data into operational banks.
+            input_bank_a_reg =
+                vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
+                                                  // uninitialized variable.
+            input_bank_a_reg = vld1q_lane_8x4(
+                next_input_data + workspace_height_stride, input_bank_a_reg, 2);
+            input_bank_b_reg = vld1q_dup_s8x4(
+                next_input_data +
+                2 * workspace_height_stride);  // Load lane 0, avoiding
+                                               // uninitialized variable.
+
+            for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+                 ++i_width) {
+              next_input_data += 4;
+              const int output_width =
+                  i_width == output_width_micro_repeats ? residual_width : 4;
+
+              // Load next sub-micro block of data.
+              input_bank_a_reg =
+                  vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
+              input_bank_a_reg =
+                  vld1q_lane_8x4(next_input_data + workspace_height_stride,
+                                 input_bank_a_reg, 3);
+              input_bank_b_reg =
+                  vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
+                                 input_bank_b_reg, 1);
+              // Iterate over input width shifts within 4x4 blocks.
+              for (int x = 0; x < output_width; ++x) {
+                int32x4_t acc = adjusted_bias_data;
+                acc = vdotq_four_lane_s32(acc, filter_reg_0_a, input_bank_a_reg,
+                                          0);
+                acc = vdotq_four_lane_s32(acc, filter_reg_1_a, input_bank_a_reg,
+                                          2);
+                acc = vdotq_four_lane_s32(acc, filter_reg_2_a, input_bank_b_reg,
+                                          0);
+
+                // Fixed-point multiplication.
+                acc = vqrdmulhq_n_s32(acc, output_multiplier);
+                acc = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                    acc, -output_shift);
+                // Add the output offset.
+                // Note that we need to fill the top half with vcombine, but can
+                // drop the instruction in ASM code.
+                int16x8_t acc_s16_0_0 =
+                    vcombine_s16(vqmovn_s32(acc), vqmovn_s32(acc));
+                acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
+                // Apply the activation function.
+                uint8x8_t acc_u8_0_0 = vqmovun_s16(acc_s16_0_0);
+                acc_u8_0_0 =
+                    vmax_u8(acc_u8_0_0, vget_low_u8(output_activation_min_vec));
+                acc_u8_0_0 =
+                    vmin_u8(acc_u8_0_0, vget_low_u8(output_activation_max_vec));
+
+                vst1_lane_8x4(output_data, acc_u8_0_0, 0);
+
+                input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 8);
+                input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 8);
+
+                output_data += output_depth;
+              }
+            }
+            output_data_base += output_height_stride;
+          }
+
+          // Move to next sub-block: advance to second set of filters.
+          filter_reg_0_a = filter_reg_0_b;
+          filter_reg_1_a = filter_reg_1_b;
+          filter_reg_2_a = filter_reg_2_b;
+        }
+      }
+      output_data_depthwise += 8;
+    }
+  }  // NOLINT(readability/fn_size) Manually unrolled.
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
+                         output_block_data, function_params);
+  }
+};
+
+template <>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseNeon3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                        /*stride=*/2> {
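+  // Unit-input-depth variant with stride 2: both 4-channel filter sub-blocks
+  // ("a" and "b") are applied to the same input banks, producing 8 output
+  // channels per depth micro block.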
+  static inline void KernelMacroBlockNeon(
+      const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int output_depth = function_params->output_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+    TFLITE_DCHECK_GE(output_activation_min, 0);
+    TFLITE_DCHECK_LT(output_activation_min, 256);
+    TFLITE_DCHECK_GE(output_activation_max, 0);
+    TFLITE_DCHECK_LT(output_activation_max, 256);
+    TFLITE_DCHECK_GE(output_offset, -32768);
+    TFLITE_DCHECK_LT(output_offset, 32768);
+
+    TFLITE_DCHECK_GE(depth_micro_repeats, 1);
+    TFLITE_DCHECK_EQ(bias_increment, 4);
+
+    const int16x8_t output_offset_vec =
+        vdupq_n_s16(static_cast<int16>(output_offset));
+    const uint8x16_t output_activation_min_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_min));
+    const uint8x16_t output_activation_max_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_max));
+
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      int8x16_t filter_reg_0_a;
+      int8x16_t filter_reg_0_b;
+      int8x16_t filter_reg_1_a;
+      int8x16_t filter_reg_1_b;
+      int8x16_t filter_reg_2_a;
+      int8x16_t filter_reg_2_b;
+
+      filter_reg_0_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_0_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+
+      TFLITE_DCHECK_EQ(bias_increment, 4);
+      const int32x4_t adjusted_bias_data_s_0 = vld1q_s32(bias_data);
+      bias_data += bias_increment;
+      const int32x4_t adjusted_bias_data_s_1 = vld1q_s32(bias_data);
+      bias_data += bias_increment;
+
+      if (block_height == 2) {
+        const int8* scratch_data = scratch_block_data;
+        uint8* output_data = output_block_data + 8 * j_depth;
+
+        int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
+        int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
+        int8x16_t input_bank_c_reg;  //  left 4, right 4, xxx, xxx.
+
+        // Load first sub-micro block of data into operational banks.
+        input_bank_a_reg =
+            vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+        input_bank_a_reg = vld1q_lane_8x4(
+            scratch_data + workspace_height_stride, input_bank_a_reg, 2);
+        input_bank_b_reg = vld1q_dup_s8x4(
+            scratch_data +
+            2 * workspace_height_stride);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+        input_bank_b_reg = vld1q_lane_8x4(
+            scratch_data + 3 * workspace_height_stride, input_bank_b_reg, 2);
+        input_bank_c_reg = vld1q_dup_s8x4(
+            scratch_data +
+            4 * workspace_height_stride);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+
+        int32x4_t acc0;
+        int32x4_t acc1;
+
+        for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+             ++i_width) {
+          const int output_width = i_width == output_width_micro_repeats
+                                       ? residual_width
+                                       : four_over_stride;
+
+          TFLITE_DCHECK_LE(output_width, 2);
+          TFLITE_DCHECK_GE(output_width, 1);
+          TFLITE_DCHECK_LE(output_width * stride_val, 4);
+          const int8* input_data = scratch_data + 4 + 4 * i_width;
+
+          // Load next sub-micro block of data.
+          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
+          input_bank_a_reg = vld1q_lane_8x4(
+              input_data + workspace_height_stride, input_bank_a_reg, 3);
+          input_bank_b_reg = vld1q_lane_8x4(
+              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
+          input_bank_b_reg = vld1q_lane_8x4(
+              input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
+          input_bank_c_reg = vld1q_lane_8x4(
+              input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
+
+          int16x8_t acc_s16_0_1;
+          uint8x8_t acc_u8_0_1;
+          // Iterate over input width shifts within 4x4 blocks.
+          {
+            acc0 = adjusted_bias_data_s_0;
+            acc1 = adjusted_bias_data_s_0;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            vst1_lane_8x4(output_data, acc_u8_0_1, 0);
+            vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
+
+            acc0 = adjusted_bias_data_s_1;
+            acc1 = adjusted_bias_data_s_1;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
+            vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
+                          1);
+
+            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+            input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
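+            // Shifting each 64-bit half right by 16 bits discards the two
+            // lowest (already consumed) input bytes, advancing the four-byte
+            // dot-product window by two positions, consistent with the
+            // stride-2 stepping implied by four_over_stride and the
+            // stride_val checks above.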
+
+            output_data += output_depth;
+          }
+          if (output_width == 2) {
+            acc0 = adjusted_bias_data_s_0;
+            acc1 = adjusted_bias_data_s_0;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            vst1_lane_8x4(output_data, acc_u8_0_1, 0);
+            vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
+
+            acc0 = adjusted_bias_data_s_1;
+            acc1 = adjusted_bias_data_s_1;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
+            vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
+                          1);
+
+            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+            input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
+
+            output_data += output_depth;
+          }
+        }
+      } else {
+        TFLITE_DCHECK_EQ(block_height, 1);
+        // Work through one slice, by row, at a time.
+        const int8* scratch_data = scratch_block_data;
+        uint8* output_data = output_block_data + 8 * j_depth;
+
+        int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
+        int8x16_t input_bank_b_reg;  //  left 2, right 2, xxx, xxx.
+
+        // Load first sub-micro block of data into operational banks.
+        input_bank_a_reg =
+            vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+        input_bank_a_reg = vld1q_lane_8x4(
+            scratch_data + workspace_height_stride, input_bank_a_reg, 2);
+        input_bank_b_reg = vld1q_dup_s8x4(
+            scratch_data +
+            2 * workspace_height_stride);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+
+        int32x4_t acc0;
+        int32x4_t acc1;
+
+        for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+             ++i_width) {
+          const int output_width =
+              i_width == output_width_micro_repeats ? residual_width : 2;
+
+          TFLITE_DCHECK_LE(output_width, 2);
+          TFLITE_DCHECK_GE(output_width, 1);
+          TFLITE_DCHECK_LE(output_width * stride_val, 4);
+          const int8* input_data = scratch_data + 4 + 4 * i_width;
+
+          // Load next sub-micro block of data.
+          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
+          input_bank_a_reg = vld1q_lane_8x4(
+              input_data + workspace_height_stride, input_bank_a_reg, 3);
+          input_bank_b_reg = vld1q_lane_8x4(
+              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
+
+          int16x8_t acc_s16_0_1;
+          uint8x8_t acc_u8_0_1;
+
+          // Iterate over input width shifts within 4x4 blocks.
+          {
+            acc0 = adjusted_bias_data_s_0;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+
+            // Second sub-block accumulation.
+            acc1 = adjusted_bias_data_s_1;
+
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
+
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            // This stores the results for both sub-blocks together.
+            vst1_u8(output_data, acc_u8_0_1);
+
+            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+
+            output_data += output_depth;
+          }
+          if (output_width == 2) {
+            acc0 = adjusted_bias_data_s_0;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+
+            // Second sub-block accumulation.
+            acc1 = adjusted_bias_data_s_1;
+
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
+
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            // This stores the results for both sub-blocks together.
+            vst1_u8(output_data, acc_u8_0_1);
+
+            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+
+            output_data += output_depth;
+          }
+        }
+      }
+    }
+  }
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data,
+                         output_block_data, function_params);
+  }
+};
+
+#endif  // USE_NEON && __aarch64__
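+
+// The kernels above all share the same requantization sequence per int32
+// accumulator (summarized here for reference; the code uses the NEON forms):
+//   acc = vqrdmulhq_n_s32(acc, output_multiplier);        // fixed-point mult
+//   acc = DivideByPOT<kUpward>::Run(acc, -output_shift);  // rounding shift
+//   narrow to int16, add output_offset (vqaddq_s16), saturate to uint8
+//   (vqmovun_s16), then clamp to [output_activation_min,
+//   output_activation_max] with vmax_u8 / vmin_u8 before storing 4 or 8
+//   bytes at a time.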
+
 // Top-level implementation function for 3x3 depthwise convolution using NEON
 // dot-product instructions.
 //
@@ -4133,6 +7520,13 @@
   }
 }
 
+#undef vst1_lane_8x4
+#undef vst1q_lane_8x4
+#undef vld1q_lane_s8x8
+#undef vld1_lane_8x4
+#undef vld1q_lane_8x4
+#undef vld1q_dup_s8x4
+
 #undef STR
 #undef STR_UNEXPANDED
 
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h
index 60b5565..e7fafa0 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h
@@ -36,174 +36,26 @@
 namespace optimized_ops {
 namespace depthwise_conv {
 
-#if defined(USE_NEON)
+#ifdef USE_NEON
+// Lane operations are for clarity and convenience. We want to load and store
+// 4 8-bit lanes together. So these are treated much like 32-bit loads and
+// 32-bit stores. Stores require 32-bit alignment.
 
-#define vst1_lane_u8x4(dst, reg, lane_num)                                  \
-  vst1_lane_u32(reinterpret_cast<uint32_t*>(dst), vreinterpret_u32_u8(reg), \
-                lane_num)
-
-#define vst1q_lane_u8x4(dst, reg, lane_num)                                   \
-  vst1q_lane_u32(reinterpret_cast<uint32_t*>(dst), vreinterpretq_u32_u8(reg), \
-                                                                              \
-                 lane_num)
+#define vst1_lane_8x4(dst, reg, lane_num)                         \
+  TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
+  vst1_lane_u32(reinterpret_cast<uint32_t*>(dst), reg, lane_num)
+#define vst1q_lane_8x4(dst, reg, lane_num)                        \
+  TFLITE_DCHECK_EQ(reinterpret_cast<std::uintptr_t>(dst) % 4, 0); \
+  vst1q_lane_u32(reinterpret_cast<uint32_t*>(dst), reg, lane_num)
 
 #define vld1q_lane_s8x8(src, reg, lane_num) \
   vld1q_lane_u64(reinterpret_cast<const uint64_t*>(src), reg, lane_num)
-
-#ifndef __aarch64__
-inline int8x16_t vqtbl4q_s8(int8x16x4_t a, uint8x16_t b) {
-  const uint8x16_t mask = vtstq_u8(b, vdupq_n_u8(8));
-
-  // Delete bit 3 from the indices.
-  const uint8x16_t high_bits = vshrq_n_u8(b, 4);
-  uint8x16_t deleted_bit_3 = b;
-  deleted_bit_3 = vsliq_n_u8(deleted_bit_3, high_bits, 3);
-
-  int8x8x4_t repacked_data;
-
-  // Calculate for lower indices.
-  repacked_data.val[0] = vget_low_u8(a.val[0]);
-  repacked_data.val[1] = vget_low_u8(a.val[1]);
-  repacked_data.val[2] = vget_low_u8(a.val[2]);
-  repacked_data.val[3] = vget_low_u8(a.val[3]);
-  const int8x16_t output_for_lower =
-      vcombine_u8(vtbl4_s8(repacked_data, vget_low_u8(deleted_bit_3)),
-                  vtbl4_s8(repacked_data, vget_high_u8(deleted_bit_3)));
-
-  // Calculate for high indices.
-  repacked_data.val[0] = vget_high_u8(a.val[0]);
-  repacked_data.val[1] = vget_high_u8(a.val[1]);
-  repacked_data.val[2] = vget_high_u8(a.val[2]);
-  repacked_data.val[3] = vget_high_u8(a.val[3]);
-  const int8x16_t output_for_higher =
-      vcombine_u8(vtbl4_s8(repacked_data, vget_low_u8(deleted_bit_3)),
-                  vtbl4_s8(repacked_data, vget_high_u8(deleted_bit_3)));
-
-  // Merge.
-  int8x16_t output = mask;
-  output = vbslq_u8(output, output_for_higher, output_for_lower);
-  return output;
-}
-#endif  // !__aarch64__
-
-// Convenience-compatibility functions.
-// Compatibility: Intrinsics reflect a mixture of older and newer ARM
-//     instructions. This actually results in ZIP1 / ZIP2 asm instructions, but
-//     one intrinsic is provided. Also older instructions operated in place,
-//     and it seems more defensive to assume that some versions of intrinsics
-//     might reflect this
-// Convenience: Callers in these kernels want both ZIP1 and ZIP2, and we do not
-//     want the calling code to get cluttered with unpacking int8x16x2_t.
-inline void vzipq_s8_in_place(int8x16_t* a, int8x16_t* b) {
-  int8x16x2_t r8x16;
-  r8x16 = vzipq_s8(*a, *b);
-  *a = r8x16.val[0];
-  *b = r8x16.val[1];
-}
-
-inline void vzipq_s8x2_in_place(int8x16_t* a, int8x16_t* b) {
-  int16x8x2_t r16x8;
-  r16x8 = vzipq_s16(vreinterpretq_s16_s8(*a), vreinterpretq_s16_s8(*b));
-  *a = vreinterpretq_s8_s16(r16x8.val[0]);
-  *b = vreinterpretq_s8_s16(r16x8.val[1]);
-}
-
-// Similar rationale to the zip-in_place functions, but callers only actually
-// need the TRN1 asm instruction result.
-inline void vtrn1_s8x2_in_place(int8x16_t* a, int8x16_t* b) {
-  int16x8x2_t r16x8;
-  r16x8 = vtrnq_s16(vreinterpretq_s16_s8(*a), vreinterpretq_s16_s8(*b));
-  *a = vreinterpretq_s8_s16(r16x8.val[0]);
-}
-
-inline void biregister_rotate_8(int8x16_t* left, int8x16_t* right) {
-  *left = vreinterpretq_s8_u32(vshrq_n_u32(vreinterpretq_u32_s8(*left), 8));
-  *left = vreinterpretq_s8_u32(vsliq_n_u32(vreinterpretq_u32_s8(*left),
-                                           vreinterpretq_u32_s8(*right), 24));
-  *right = vreinterpretq_s8_u32(vshrq_n_u32(vreinterpretq_u32_s8(*right), 8));
-}
-
-#ifndef __aarch64__
-inline int32x4_t vpaddq_s32(int32x4_t a, int8x16_t b) {
-  int32x4x2_t deinterleaved = vuzpq_s32(a, b);
-  return vqaddq_s32(deinterleaved.val[0], deinterleaved.val[1]);
-}
-#endif  // !__aarch64__
-
-#ifdef __ARM_FEATURE_DOTPROD
-// The vdotq_lane_s32 takes int8x8t for the rhs parameter, whereas the actual
-// instruction selects from between 4 32-bit (4x8-bit packed) sub-registers, an
-// unusual interpretation of "lane".
-inline int32x4_t vdotq_four_lane_s32(int32x4_t acc, int8x16_t lhs,
-                                     int8x16_t rhs, const int lane) {
-  switch (lane) {
-    case 0:
-      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_low_s8(rhs)), 0);
-    case 1:
-      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_low_s8(rhs)), 1);
-    case 2:
-      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_high_s8(rhs)),
-                            0);
-    case 3:
-    default:
-      return vdotq_lane_s32(acc, lhs, vreinterpret_s32_s8(vget_high_s8(rhs)),
-                            1);
-  }
-}
-
-#else
-
-inline int32x4_t vdotq_s32(int32x4_t acc, int8x16_t lhs, int8x16_t rhs) {
-  int32x4_t sum0 = vpaddlq_s16(vmull_s8(vget_low_s8(lhs), vget_low_s8(rhs)));
-  int32x4_t sum1 = vpaddlq_s16(vmull_s8(vget_high_s8(lhs), vget_high_s8(rhs)));
-  int32x4_t sum = vpaddq_s32(sum0, sum1);
-  return vaddq_s32(acc, sum);
-}
-
-inline int32x4_t vdotq_four_lane_s32(int32x4_t acc, int8x16_t lhs,
-                                     int8x16_t rhs, int lane) {
-  int8x8_t lane_rhs;
-  if (lane == 0) {
-    lane_rhs = vreinterpret_s8_s32(
-        vdup_lane_s32(vreinterpret_s32_s8(vget_low_s8(rhs)), 0));
-  } else if (lane == 1) {
-    lane_rhs = vreinterpret_s8_s32(
-        vdup_lane_s32(vreinterpret_s32_s8(vget_low_s8(rhs)), 1));
-  } else if (lane == 2) {
-    lane_rhs = vreinterpret_s8_s32(
-        vdup_lane_s32(vreinterpret_s32_s8(vget_high_s8(rhs)), 0));
-  } else {
-    lane_rhs = vreinterpret_s8_s32(
-        vdup_lane_s32(vreinterpret_s32_s8(vget_high_s8(rhs)), 1));
-  }
-  int32x4_t sum0 = vpaddlq_s16(vmull_s8(vget_low_s8(lhs), lane_rhs));
-  int32x4_t sum1 = vpaddlq_s16(vmull_s8(vget_high_s8(lhs), lane_rhs));
-  int32x4_t sum = vpaddq_s32(sum0, sum1);
-  return vaddq_s32(acc, sum);
-}
-#endif  // !DOTPROD
-#endif  // ARM NEON
-
-template <DepthwiseConvOutputRounding output_rounding>
-struct DivideByPOT {};
-
-template <>
-struct DivideByPOT<DepthwiseConvOutputRounding::kAwayFromZero> {
-  template <typename IntegerType>
-  static inline IntegerType Run(IntegerType x, int exponent) {
-    return RoundingDivideByPOT(x, exponent);
-  }
-};
-
-#if defined(USE_NEON)
-template <>
-struct DivideByPOT<DepthwiseConvOutputRounding::kUpward> {
-  template <typename IntegerType>
-  static inline IntegerType Run(IntegerType x, int exponent) {
-    return vqrshlq_s32(x, vdupq_n_s32(static_cast<int32>(-exponent)));
-  }
-};
-#endif  // ARM NEON
+#define vld1_lane_8x4(src, reg, lane_num) \
+  vld1_lane_s32(reinterpret_cast<const int32*>(src), reg, lane_num)
+#define vld1q_lane_8x4(src, reg, lane_num) \
+  vld1q_lane_s32(reinterpret_cast<const int32*>(src), reg, lane_num)
+#define vld1q_dup_s8x4(src) vld1q_dup_s32(reinterpret_cast<const int32*>(src))
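+// Example (comment only): with a 4-byte-aligned int8 buffer `buf`,
+//   reg = vld1q_lane_8x4(buf, reg, 2);
+// loads buf[0..3] into 32-bit lane 2 of `reg`, and
+//   vst1_lane_8x4(dst, out, 0);
+// writes the 4 bytes held in 32-bit lane 0 of `out` to a 4-byte-aligned `dst`.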
+#endif  // USE_NEON
 
 template <>
 struct ProcessPerDepth<DepthwiseConvImplementation::kUseCModel3x3DotProduct> {
@@ -305,7 +157,120 @@
   }
 };
 
-#if defined(USE_NEON)
+template <>
+struct ProcessPerDepth<DepthwiseConvImplementation::kUseUnwound3x3DotProduct> {
+  static inline void Run(const uint8* filter_data, const int32* bias_data,
+                         int8* shuffled_filter_data, int32* adjusted_bias_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    const int depth = function_params->output_depth;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int bias_increment = function_params->bias_increment;
+
+    // Simulate NEON-register transposition of subset of filter.
+    int8 filter_bank_a_0[4][4];  // Depth 4, width 4.
+    int8 filter_bank_a_1[4][4];
+    int8 filter_bank_a_2[4][4];
+    int8 filter_bank_b_0[4][4];
+    int8 filter_bank_b_1[4][4];
+    int8 filter_bank_b_2[4][4];
+
+    // Load filter data in, essentially dropping the [depth/8] dimension, which
+    // is equivalent to loading just the depth needed for one micro-block.
+    //
+    // loaded_filter has dimensions height 3, width 4, sub-block 0 or 1,
+    // depth 4.
+    uint8 loaded_filter_0[4][2][4];
+    uint8 loaded_filter_1[4][2][4];
+    uint8 loaded_filter_2[4][2][4];
+
+    constexpr int kSymmetricZeroPoint = 128;
+    const int32 input_offset = function_params->input_offset;
+    TFLITE_DCHECK_GE(input_offset, -255);
+    TFLITE_DCHECK_LE(input_offset, 0);
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      const uint8* filter_block = filter_data + 8 * j_depth;
+
+      // Filter data is provided as filter_block[3][3][depth/8][2][4].
+      // height 3, width 3, micro-blocks, sub-block 0 or 1, depth 4.
+      // filter_bank[3][2][4][4]; Sub-block, height 3, depth 4, width 4.
+      for (int x = 0; x < 3; ++x) {
+        memcpy(loaded_filter_0[x][0], &filter_block[3 * 0 * depth + x * depth],
+               8);
+        memcpy(loaded_filter_1[x][0], &filter_block[3 * 1 * depth + x * depth],
+               8);
+        memcpy(loaded_filter_2[x][0], &filter_block[3 * 2 * depth + x * depth],
+               8);
+      }
+      // Pad the filter with the symmetric zero point, so that the padded taps
+      // become 0 once the zero point is subtracted below and are effectively
+      // disregarded.
+      memset(loaded_filter_0[3][0], kSymmetricZeroPoint, 8);
+      memset(loaded_filter_1[3][0], kSymmetricZeroPoint, 8);
+      memset(loaded_filter_2[3][0], kSymmetricZeroPoint, 8);
+
+      for (int z = 0; z < 4; ++z) {
+        for (int x = 0; x < 4; ++x) {
+          filter_bank_a_0[z][x] =
+              loaded_filter_0[x][0][z] - kSymmetricZeroPoint;
+          filter_bank_b_0[z][x] =
+              loaded_filter_0[x][1][z] - kSymmetricZeroPoint;
+          filter_bank_a_1[z][x] =
+              loaded_filter_1[x][0][z] - kSymmetricZeroPoint;
+          filter_bank_b_1[z][x] =
+              loaded_filter_1[x][1][z] - kSymmetricZeroPoint;
+          filter_bank_a_2[z][x] =
+              loaded_filter_2[x][0][z] - kSymmetricZeroPoint;
+          filter_bank_b_2[z][x] =
+              loaded_filter_2[x][1][z] - kSymmetricZeroPoint;
+        }
+      }
+
+      memcpy(shuffled_filter_data, filter_bank_a_0, 16);
+      shuffled_filter_data += 16;
+      memcpy(shuffled_filter_data, filter_bank_b_0, 16);
+      shuffled_filter_data += 16;
+      memcpy(shuffled_filter_data, filter_bank_a_1, 16);
+      shuffled_filter_data += 16;
+      memcpy(shuffled_filter_data, filter_bank_b_1, 16);
+      shuffled_filter_data += 16;
+      memcpy(shuffled_filter_data, filter_bank_a_2, 16);
+      shuffled_filter_data += 16;
+      memcpy(shuffled_filter_data, filter_bank_b_2, 16);
+      shuffled_filter_data += 16;
+
+      int32 adjusted_bias_data_0[4];
+      int32 adjusted_bias_data_1[4];
+      // For instance, if input_offset == -128, no adjustment is needed.
+      for (int z = 0; z < 4; ++z) {
+        adjusted_bias_data_0[z] = bias_data[z];
+        adjusted_bias_data_1[z] = bias_data[4 + z];
+        for (int x = 0; x < 4; ++x) {
+          adjusted_bias_data_0[z] +=
+              input_offset_difference * filter_bank_a_0[z][x];
+          adjusted_bias_data_0[z] +=
+              input_offset_difference * filter_bank_a_1[z][x];
+          adjusted_bias_data_0[z] +=
+              input_offset_difference * filter_bank_a_2[z][x];
+          adjusted_bias_data_1[z] +=
+              input_offset_difference * filter_bank_b_0[z][x];
+          adjusted_bias_data_1[z] +=
+              input_offset_difference * filter_bank_b_1[z][x];
+          adjusted_bias_data_1[z] +=
+              input_offset_difference * filter_bank_b_2[z][x];
+
+          adjusted_bias_data[z] = adjusted_bias_data_0[z];
+          adjusted_bias_data[4 + z] = adjusted_bias_data_1[z];
+        }
+      }
+      bias_data += 2 * bias_increment;
+      adjusted_bias_data += 8;
+    }
+  }
+};
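+
+// Summary of the adjustment above: the kernels consume inputs shifted by
+// kSymmetricZeroPoint (128), so each depth-4 sub-block would otherwise
+// accumulate a constant (input_offset + 128) * (sum of its 3x4 shuffled filter
+// values, with the padded fourth column contributing 0). That constant is
+// folded into the bias once here instead of being added per output pixel.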
+
+#ifdef USE_NEON
 template <>
 struct ProcessPerDepth<
     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct> {
@@ -741,37 +706,490 @@
   }
 };
 
-#if defined(USE_NEON)
-#if defined(__aarch64__)
-// Experiments suggest that a modest performance improvement is seen, at least
-// on 855 chipset big cores, with cache hints.
-inline void PreloadInputBlock(
-    const uint8* input_block_data,
-    const DepthwiseConvDotProdParams* function_params) {
-  // Preload.
-  const int input_width_micro_repeats =
-      function_params->input_width_micro_repeats;
-  const int block_height = function_params->inbound_block_height;
-  const int residual_width = function_params->residual_width;
-  const int input_height_stride = function_params->input_height_stride;
-  const int input_depth = function_params->input_depth;
+// Beginning of code section containing intermediate code transformation.
+//
+// This section is only compiled when kUseUnwound3x3DotProduct versions of
+// templated functions are selected.
+template <>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kNoMultiplication,
+                      /*max_padding=*/0> {
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+    const int input_depth = function_params->input_depth;
 
-  {
-    const int total_width = 4 * input_width_micro_repeats + residual_width;
-    const uint8* row_ptr = input_block_data;
+    TFLITE_DCHECK_GE(depth_micro_repeats, 0);
+    constexpr int kSymmetricZeroPoint = 128;
+    const int micro_block_size = 4 * 8;
+    const int depth_advance = width_overall_micro_repeats * micro_block_size;
+    const int width_advance =
+        micro_block_size *
+        (1 - depth_micro_repeats * width_overall_micro_repeats);
+    const int height_advance = workspace_height_stride -
+                               width_overall_micro_repeats * micro_block_size;
+    const int input_depth_skip = 4 * input_depth - 8 * depth_micro_repeats;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in the
+    // NEON code we are simulating. Note the blocks of 4x4 are still interleaved
+    // down the depth.
+    int8 tmp_load[4][2][4];
+    int8 tmp_transposed[4][2][4];
+    int8 tmp_interleaved[2][4][4];
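+    // Reading aid for steps A-D below: tmp_load[x][s][d] holds width position
+    // x, sub-block s, depth d; tmp_transposed swaps the width and depth
+    // indices within each sub-block; tmp_interleaved[s] is the resulting
+    // depth-major 4x4 block for sub-block s, stored 8 bytes (two depth rows)
+    // at a time into the scratch workspace.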
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data = scratch_block_data;
     for (int k_height = 0; k_height < block_height; ++k_height) {
-      const uint8* ptr = row_ptr;
-      for (int j = 0; j < total_width; ++j) {
-        // Input data is loaded once.
-        asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
-        ptr += input_depth;
-      }
-      row_ptr += input_height_stride;
-    }
-  }
-}
-#endif
+      const uint8* input_data = input_block_data;
+      input_block_data += input_height_stride;
 
+      // Traverse the width one point at a time, but the depth in (micro) blocks
+      // of size 8.
+      //
+      // The depth and width margins, which are filled with "zeros", may be
+      // larger than is strictly needed to calculate output. This is because the
+      // conv calculation is performed across complete micro blocks.
+      for (int j_width = 0; j_width < input_width_micro_repeats; ++j_width) {
+        // Load, then zero.
+        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+          // A. Simulate register loading.
+          for (int x = 0; x < 4; ++x) {
+            for (int s = 0; s < 2; ++s) {
+              for (int d = 0; d < 4; ++d) {
+                tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
+                                    kSymmetricZeroPoint;
+              }
+            }
+          }
+          // B. Simulate between-register transposition.
+          for (int x = 0; x < 4; ++x) {
+            for (int y = 0; y < 4; ++y) {
+              tmp_transposed[x][0][y] = tmp_load[y][0][x];
+              tmp_transposed[x][1][y] = tmp_load[y][1][x];
+            }
+          }
+
+          // C and D are to be performed together as 4-byte stores in NEON code.
+          // C. Simulate between-register interleaving.
+          for (int x = 0; x < 4; ++x) {
+            for (int y = 0; y < 4; ++y) {
+              tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
+              tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
+            }
+          }
+          // D. Simulate mangled storage arrangement.
+          memcpy(&scratch_data[0], tmp_interleaved[0][0], 8);
+          memcpy(&scratch_data[8], tmp_interleaved[0][2], 8);
+          memcpy(&scratch_data[16], tmp_interleaved[1][0], 8);
+          memcpy(&scratch_data[24], tmp_interleaved[1][2], 8);
+
+          scratch_data += depth_advance;
+          input_data += 8;
+        }
+        scratch_data += width_advance;
+        input_data += input_depth_skip;
+      }
+      if (width_overall_micro_repeats > input_width_micro_repeats) {
+        TFLITE_DCHECK_EQ(width_overall_micro_repeats,
+                         input_width_micro_repeats + 1);
+        TFLITE_DCHECK_GT(residual_width, 0);
+        // Figure out division of work (available input vs zero-ed).
+        const int adjusted_residual_width = residual_width;
+        // Load, then zero.
+        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+          // A. Simulate register loading.
+          for (int x = 0; x < adjusted_residual_width; ++x) {
+            for (int s = 0; s < 2; ++s) {
+              for (int d = 0; d < 4; ++d) {
+                tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
+                                    kSymmetricZeroPoint;
+              }
+            }
+          }
+          for (int x = adjusted_residual_width; x < 4; ++x) {
+            for (int s = 0; s < 2; ++s) {
+              for (int d = 0; d < 4; ++d) {
+                tmp_load[x][s][d] = 0;
+              }
+            }
+          }
+          // B. Simulate between-register transposition.
+          for (int x = 0; x < 4; ++x) {
+            for (int y = 0; y < 4; ++y) {
+              tmp_transposed[x][0][y] = tmp_load[y][0][x];
+              tmp_transposed[x][1][y] = tmp_load[y][1][x];
+            }
+          }
+
+          // C and D are to be performed together as 4-byte stores in NEON code.
+          // C. Simulate between-register interleaving.
+          for (int x = 0; x < 4; ++x) {
+            for (int y = 0; y < 4; ++y) {
+              tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
+              tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
+            }
+          }
+          // D. Simulate mangled storage arrangement.
+          memcpy(&scratch_data[0], tmp_interleaved[0][0], 8);
+          memcpy(&scratch_data[8], tmp_interleaved[0][2], 8);
+          memcpy(&scratch_data[16], tmp_interleaved[1][0], 8);
+          memcpy(&scratch_data[24], tmp_interleaved[1][2], 8);
+
+          scratch_data += depth_advance;
+          input_data += 8;
+        }
+        scratch_data += width_advance;
+        input_data += input_depth_skip;
+      }
+      scratch_data += height_advance;
+    }
+
+    TFLITE_DCHECK_EQ(scratch_data, scratch_block_data +
+                                       block_height * workspace_height_stride);
+  }
+};
+
+template <>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kNoMultiplication,
+                      /*max_padding=*/1> {
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    // Just use C model code for case of padding. Optimized versions merge the
+    // modifications therein to handle padding.
+    PackMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
+                   DepthwiseConvDepthMultiplication::kNoMultiplication,
+                   /*max_padding=*/1>::Run(height_block_number,
+                                           width_block_number, input_block_data,
+                                           scratch_block_data, function_params);
+  }
+};
+
+template <int32 max_padding>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                      max_padding> {
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+
+    const int padding_left = function_params->padding_left;
+    const int padding_right = function_params->padding_right;
+    const int padding_top = function_params->padding_top;
+    const int padding_bottom = function_params->padding_bottom;
+
+    constexpr int kSymmetricZeroPoint = 128;
+
+    TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
+
+    const bool leading_width_padding =
+        padding_left > 0 && width_block_number == 0;
+    const bool trailing_width_padding =
+        padding_right > 0 &&
+        width_block_number == (function_params->width_macro_count - 1);
+    const bool leading_height_padding =
+        padding_top > 0 && height_block_number < 0;
+    const bool trailing_height_padding =
+        padding_bottom > 0 &&
+        height_block_number == (function_params->height_macro_count - 1);
+
+    const int32 input_offset = function_params->input_offset;
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data_base = scratch_block_data;
+
+    int copy_block_height = block_height;
+    if (leading_height_padding) {
+      copy_block_height -= 1;
+      memset(scratch_data_base, -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+      scratch_data_base += workspace_height_stride;
+      input_block_data += input_height_stride;
+    }
+    if (trailing_height_padding) {
+      copy_block_height -= 1;
+    }
+
+    int adjusted_residual_width =
+        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
+                                                                : 4;
+
+    if (trailing_width_padding) {
+      adjusted_residual_width -= 1;
+    }
+    int start_width = 0;
+    if (leading_width_padding) {
+      start_width = 1;
+      input_block_data += 1;
+    }
+
+    const int copy_size = (width_overall_micro_repeats - 1) * 4 +
+                          adjusted_residual_width - start_width;
+    // Adjusted so that later conditionals are simplified.
+    const int copy_size_adjusted =
+        trailing_width_padding ? copy_size + 1 : copy_size;
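+    // For example (illustrative values): with width_overall_micro_repeats ==
+    // 3, adjusted_residual_width == 2 and no width padding, copy_size ==
+    // 2 * 4 + 2 - 0 == 10 bytes per row and copy_size_adjusted == copy_size.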
+
+    TFLITE_DCHECK_LE(
+        copy_size,
+        input_height_stride - width_block_number * input_width_micro_repeats);
+    // We may drop up to stride-1 of trailing input.
+    TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
+
+    // This is used to simulate what should happen in registers.
+    int8 tmp_data[16];
+
+    int scratch_data_offset = 0;
+    int input_block_offset = 0;
+
+    if (copy_size >= 16) {
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // The surrounding condition ensures that we always need at least one
+        // iteration of the main copy loop. In the case of leading width
+        // padding, we unroll this specially.
+        if (leading_width_padding) {
+          memcpy(tmp_data + 1, input_block_data + input_block_offset, 15);
+          for (int i = 0; i < 16; ++i) {
+            tmp_data[i] += -kSymmetricZeroPoint;
+          }
+          tmp_data[0] = -input_offset_difference;
+          memcpy(scratch_data, tmp_data, 16);
+          copy_done += 15;
+        }
+
+        // Main copy loop.
+        for (; (copy_done + 16) <= copy_size; copy_done += 16) {
+          memcpy(tmp_data, input_block_data + input_block_offset + copy_done,
+                 16);
+          for (int i = 0; i < 16; ++i) {
+            tmp_data[i] += -kSymmetricZeroPoint;
+          }
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0);
+          memcpy(&scratch_data[start_width + copy_done], tmp_data, 16);
+        }
+
+        const int copy_remaining = copy_size - copy_done;
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size_adjusted) {
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
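+          // For instance (illustrative values): with copy_size == 21 and
+          // copy_done == 16, copy_remaining == 5, so this load starts 11 bytes
+          // early and the in-place shift below keeps only the 5 bytes that
+          // have not yet been copied.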
+          memcpy(tmp_data,
+                 input_block_data + input_block_offset + copy_done -
+                     (16 - copy_remaining),
+                 16);
+          // Shift to select the part that we need.
+          for (int i = 0; i < copy_remaining; ++i) {
+            tmp_data[i] = tmp_data[(16 - copy_remaining) + i];
+          }
+          for (int i = 0; i < 16; ++i) {
+            tmp_data[i] += -kSymmetricZeroPoint;
+          }
+          // Apply padding to remainder, some unnecessary but costless in regs.
+          for (int i = copy_remaining; i < 16; ++i) {
+            tmp_data[i] = -input_offset_difference;
+          }
+          const int final_repeats =
+              width_overall_micro_repeats - (start_width + copy_done) / 4;
+          for (int i = 0; i < final_repeats; ++i) {
+            memcpy(&scratch_data[start_width + copy_done], tmp_data + 4 * i, 4);
+            copy_done += 4;
+          }
+        }
+        memset(scratch_data + start_width + copy_done, -input_offset_difference,
+               kWorkspaceExtension);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else if (copy_size >= 4) {
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // The surrounding condition ensures that we always need at least one
+        // iteration of the main copy loop. In the case of leading width
+        // padding, we unroll this specially.
+        if (leading_width_padding) {
+          memcpy(tmp_data + 1, input_block_data + input_block_offset, 3);
+          for (int i = 0; i < 4; ++i) {
+            tmp_data[i] += -kSymmetricZeroPoint;
+          }
+          tmp_data[0] = -input_offset_difference;
+          memcpy(scratch_data, tmp_data, 4);
+          copy_done += 3;
+        }
+
+        for (; (copy_done + 4) <= copy_size; copy_done += 4) {
+          memcpy(tmp_data, input_block_data + input_block_offset + copy_done,
+                 4);
+          for (int i = 0; i < 4; ++i) {
+            tmp_data[i] += -kSymmetricZeroPoint;
+          }
+          // Perform as 4 int32 stores, because that is our alignment.
+          memcpy(&scratch_data[start_width + copy_done], tmp_data, 4);
+        }
+
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+        const int copy_remaining = copy_size - copy_done;
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size_adjusted) {
+          TFLITE_DCHECK_LT(copy_remaining, 4);
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
+          memcpy(tmp_data,
+                 input_block_data + input_block_offset + copy_done -
+                     (4 - copy_remaining),
+                 4);
+          // Shift to select the part that we need.
+          for (int i = 0; i < copy_remaining; ++i) {
+            tmp_data[i] = tmp_data[(4 - copy_remaining) + i];
+          }
+          for (int i = 0; i < 4; ++i) {
+            tmp_data[i] += -kSymmetricZeroPoint;
+          }
+          // Apply padding to remainder, some unnecessary but costless in regs.
+          for (int i = copy_remaining; i < 4; ++i) {
+            tmp_data[i] = -input_offset_difference;
+          }
+          memcpy(&scratch_data[start_width + copy_done], tmp_data, 4);
+          copy_done += 4;
+        }
+        memset(scratch_data + start_width + copy_done, -input_offset_difference,
+               kWorkspaceExtension);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else if (width_overall_micro_repeats == 2) {
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Apply padding by quick fill of whole reg.
+        for (int i = 0; i < 8; ++i) {
+          tmp_data[i] = -input_offset;
+        }
+        for (int i = 0; i < copy_size; ++i) {
+          // Apply shift-left insert, tmp_data as both operands.
+          // The zero-index byte is left unchanged.
+          for (int i = 7; i > 0; --i) {
+            tmp_data[i] = tmp_data[i - 1];
+          }
+          tmp_data[1] =
+              input_block_data[input_block_offset + (copy_size - 1 - i)];
+        }
+        if (!leading_width_padding) {
+          // Remove the leading padding, leaving junk in the trailing byte;
+          // this is OK because the maximum size is less than 8.
+          TFLITE_DCHECK_LT(copy_size_adjusted + start_width, 8);
+          for (int i = 0; i < 7; ++i) {
+            tmp_data[i] = tmp_data[i + 1];
+          }
+        }
+        for (int i = 0; i < 8; ++i) {
+          tmp_data[i] += -kSymmetricZeroPoint;
+        }
+        memcpy(scratch_data_base + scratch_data_offset, tmp_data, 8);
+        memset(scratch_data_base + scratch_data_offset + 8,
+               -input_offset_difference, kWorkspaceExtension);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else {
+      TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
+      // This path is basically the same as the preceding, 2-micro-block one,
+      // but here we simply store fewer bytes.
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Apply padding by quick fill of whole reg.
+        for (int i = 0; i < 8; ++i) {
+          tmp_data[i] = -input_offset;
+        }
+        for (int i = 0; i < copy_size; ++i) {
+          // Apply shift-left insert, tmp_data as both operands.
+          // The zero-index byte is left unchanged.
+          for (int i = 7; i > 0; --i) {
+            tmp_data[i] = tmp_data[i - 1];
+          }
+          tmp_data[1] =
+              input_block_data[input_block_offset + (copy_size - 1 - i)];
+        }
+        if (!leading_width_padding) {
+          // Remove the leading padding, leaving junk in the trailing byte;
+          // this is OK because the maximum size is less than 8.
+          TFLITE_DCHECK_LT(copy_size_adjusted + start_width, 8);
+          for (int i = 0; i < 7; ++i) {
+            tmp_data[i] = tmp_data[i + 1];
+          }
+        }
+        for (int i = 0; i < 8; ++i) {
+          tmp_data[i] += -kSymmetricZeroPoint;
+        }
+        memcpy(scratch_data_base + scratch_data_offset, tmp_data, 4);
+        memset(scratch_data_base + scratch_data_offset + 4,
+               -input_offset_difference, kWorkspaceExtension);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    }
+
+    scratch_data_base += copy_block_height * workspace_height_stride;
+
+    if (trailing_height_padding) {
+      memset(scratch_data_base, -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+      scratch_data_base += workspace_height_stride;
+    }
+
+    TFLITE_DCHECK_EQ(
+        scratch_data_base,
+        scratch_block_data + block_height * workspace_height_stride);
+  }
+};
+// The preceding section is only compiled when kUseUnwound3x3DotProduct versions
+// of templated functions are selected.
+//
+// End of code section containing intermediate code transformation.
+
+#ifdef USE_NEON
 template <>
 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
                       DepthwiseConvDepthMultiplication::kNoMultiplication,
@@ -953,7 +1371,7 @@
                          const uint8* input_block_data,
                          int8* scratch_block_data,
                          const DepthwiseConvDotProdParams* function_params) {
-#if defined(__aarch64__)
+#ifdef __aarch64__
     PreloadInputBlock(input_block_data, function_params);
 #endif
 
@@ -1213,7 +1631,7 @@
                          const uint8* input_block_data,
                          int8* scratch_block_data,
                          const DepthwiseConvDotProdParams* function_params) {
-#if defined(__aarch64__)
+#ifdef __aarch64__
     PreloadInputBlock(input_block_data, function_params);
 #endif
 
@@ -1227,11 +1645,356 @@
 struct PackMacroBlock<DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
                       DepthwiseConvDepthMultiplication::kUnitInputDepth,
                       /*max_padding=*/1> {
+  static inline void PackMacroBlockIntrinsics(
+      int32 height_block_number, int32 width_block_number,
+      const uint8* input_block_data, int8* scratch_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int block_height = function_params->inbound_block_height;
+    const int residual_width = function_params->residual_width;
+    const int input_height_stride = function_params->input_height_stride;
+
+    const int padding_left = function_params->padding_left;
+    const int padding_right = function_params->padding_right;
+    const int padding_top = function_params->padding_top;
+    const int padding_bottom = function_params->padding_bottom;
+
+    constexpr int kSymmetricZeroPoint = 128;
+
+    TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
+
+    const bool leading_width_padding =
+        padding_left > 0 && width_block_number == 0;
+    const bool trailing_width_padding =
+        padding_right > 0 &&
+        width_block_number == (function_params->width_macro_count - 1);
+    const bool leading_height_padding =
+        padding_top > 0 && height_block_number < 0;
+    const bool trailing_height_padding =
+        padding_bottom > 0 &&
+        height_block_number == (function_params->height_macro_count - 1);
+
+    const int32 input_offset = function_params->input_offset;
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+
+    // Work through one slice, by row, at a time.
+    int8* scratch_data_base = scratch_block_data;
+
+    int copy_block_height = block_height;
+    if (leading_height_padding) {
+      copy_block_height -= 1;
+      memset(scratch_data_base, -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+      scratch_data_base += workspace_height_stride;
+      input_block_data += input_height_stride;
+    }
+    if (trailing_height_padding) {
+      copy_block_height -= 1;
+    }
+
+    int adjusted_residual_width =
+        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
+                                                                : 4;
+
+    if (trailing_width_padding) {
+      adjusted_residual_width -= 1;
+    }
+    int start_width = 0;
+    if (leading_width_padding) {
+      start_width = 1;
+      input_block_data += 1;
+    }
+
+    const int copy_size = (width_overall_micro_repeats - 1) * 4 +
+                          adjusted_residual_width - start_width;
+    // Adjusted so that later conditionals are simplified.
+    const int copy_size_adjusted =
+        trailing_width_padding ? copy_size + 1 : copy_size;
+
+    TFLITE_DCHECK_LE(
+        copy_size,
+        input_height_stride - width_block_number * input_width_micro_repeats);
+    // We may drop up to stride-1 of trailing input.
+    TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
+
+    int scratch_data_offset = 0;
+    int input_block_offset = 0;
+
+    constexpr uint8 kSignBit = 0x80;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON
+    // code. Note the blocks of 4x4 are still interleaved down the depth.
+    int8x16_t work_reg;
+    int8x8_t half_work_reg;
+    int8x8_t padding_mask;
+
+    // Effect subtraction of zero-point = 128 by XOR of sign bit.
+    const uint8x16_t sign_bit = vdupq_n_u8(kSignBit);
+    const uint8x16_t padding_reg = vdupq_n_u8(-input_offset);
+    padding_mask = vdup_n_s8(-1);
+    half_work_reg = vdup_n_s8(0);
+
+    if (copy_size >= 16) {
+      const int copy_remaining = (copy_size + start_width) & 0x7;
+      padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining));
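+      // After this shift the low copy_remaining bytes of padding_mask are 0x00
+      // and the rest are 0xff, so the vbsl below keeps loaded data in the low
+      // bytes and substitutes padding values in the high bytes.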
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // The surrounding condition ensures that we always need at least one
+        // iteration of the main copy loop. In the case of leading width
+        // padding, we unroll this specially.
+        if (leading_width_padding) {
+          work_reg = vld1q_u8(input_block_data + input_block_offset);
+          work_reg = vextq_s8(padding_reg, work_reg, 15);
+          work_reg = veorq_s8(work_reg, sign_bit);
+          vst1q_s8(scratch_data, work_reg);
+          copy_done += 15;
+        }
+
+        // Main copy loop.
+        for (; (copy_done + 16) <= copy_size; copy_done += 16) {
+          work_reg =
+              vld1q_u8(input_block_data + input_block_offset + copy_done);
+          work_reg = veorq_s8(work_reg, sign_bit);
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0);
+          vst1q_s8(scratch_data + start_width + copy_done, work_reg);
+        }
+
+        if (copy_done + 8 <= copy_size) {
+          half_work_reg =
+              vld1_u8(input_block_data + input_block_offset + copy_done);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
+          vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
+          copy_done += 8;
+        }
+
+        TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size_adjusted) {
+          // Employ overlapping-load strategy in order to load full register,
+          // but use only part.
+          // This has the advantage of resulting in zeros after shifting.
+          half_work_reg =
+              vld1_u8(input_block_data + input_block_offset + copy_size - 8);
+
+          half_work_reg =
+              vshl_u64(half_work_reg, vdup_n_s64(-8 * (8 - copy_remaining)));
+          half_work_reg =
+              vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0);
+          vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
+        }
+
+        // Trailing guard.
+        vst1_s8(scratch_data + start_width + copy_done, half_work_reg);
+        vst1_s8(scratch_data + start_width + copy_done + 8, half_work_reg);
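+        // These extra stores presumably serve the same purpose as the
+        // kWorkspaceExtension memset in the non-NEON path: they make the bytes
+        // just past the copied row defined so later over-reads are harmless.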
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else if (copy_size >= 4) {
+      const int copy_remaining = (copy_size + start_width) & 0x3;
+      padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining));
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        // Work through one slice, by row, at a time.
+        int8* scratch_data = scratch_data_base + scratch_data_offset;
+
+        int copy_done = 0;
+
+        // The surrounding condition ensures that we always need at least one
+        // iteration of the main copy loop. In the case of leading width
+        // padding, we unroll this specially.
+        if (leading_width_padding) {
+          half_work_reg = vld1_lane_8x4(input_block_data + input_block_offset,
+                                        half_work_reg, 0);
+          half_work_reg = vext_s8(vget_low_s8(padding_reg), half_work_reg, 7);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          vst1_lane_8x4(scratch_data, half_work_reg, 0);
+          copy_done += 3;
+        }
+
+        // Main copy loop.
+        for (; (copy_done + 4) <= copy_size; copy_done += 4) {
+          // Important! Most compilation configurations will compile and run
+          // without the reinterpret_cast. Sanitizers may fail silently on
+          // lane-loading, with an obscure bug or mis-feature, probably in
+          // unhygienic macro expansion.
+          half_work_reg =
+              vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
+                            half_work_reg, 0);
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
+          vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg,
+                        0);
+        }
+
+        TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
+        // Total amount
+        // = copy_size - copy_done + 4 - adjusted_residual_width
+        // = width_overall_micro_repeats * 4 - start_width - copy_done.
+        // Undone micro blocks
+        // = width_overall_micro_repeats - (start_width + copy_done) / 4.
+
+        // Conditional is (copy_remaining > 0 || trailing_width_padding).
+        if (copy_done < copy_size_adjusted) {
+          TFLITE_DCHECK_LT(copy_remaining, 4);
+          // Employ an overlapping-load strategy: load a full register, but
+          // use only part of it.
+          // This has the advantage of producing zeros after shifting.
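+          // As in the wider branch above, the load is anchored at the end of
+          // the row, the shift moves the at-most-3 residual bytes down to the
+          // low lanes, and the bitselect fills the rest with padding.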
+          half_work_reg = vld1_lane_8x4(
+              input_block_data + input_block_offset + copy_size - 4,
+              half_work_reg, 0);
+
+          half_work_reg =
+              vshl_u64(half_work_reg, vdup_n_s64(-8 * (4 - copy_remaining)));
+          half_work_reg =
+              vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+
+          half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+          TFLITE_DCHECK_EQ((start_width + copy_done) % 4, 0);
+          vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg,
+                        0);
+          copy_done += 4;
+        }
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data + start_width + copy_done, half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + start_width + copy_done + 4, half_work_reg,
+                      0);
+        vst1_lane_8x4(scratch_data + start_width + copy_done + 8, half_work_reg,
+                      0);
+        vst1_lane_8x4(scratch_data + start_width + copy_done + 12,
+                      half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else if (width_overall_micro_repeats == 2) {
+      // Special case of 1 + 3 + 1, padding + copy + padding.
+      // This is rarely executed in practice.
+      TFLITE_DCHECK_EQ(copy_size, 3);
+      TFLITE_DCHECK_EQ(start_width, 1);
+      TFLITE_DCHECK(leading_width_padding);
+      TFLITE_DCHECK(trailing_width_padding);
+      // ASM should use MOVI 64-bit set.
+      padding_mask = vcreate_u64(~0xffffff00L);
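+      // Mask bytes (lane 0 is the least-significant byte): lane 0 and lanes
+      // 4-7 are 0xff and select the padding value in the bitselect below;
+      // lanes 1-3 are zero and let the three loaded input bytes through.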
+
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        half_work_reg = vld1_lane_s8(reinterpret_cast<const int8*>(
+                                         input_block_data + input_block_offset),
+                                     half_work_reg, 1);
+        half_work_reg =
+            vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
+                                                       input_block_offset + 1),
+                         half_work_reg, 2);
+        half_work_reg =
+            vld1_lane_s8(reinterpret_cast<const int8*>(input_block_data +
+                                                       input_block_offset + 2),
+                         half_work_reg, 3);
+        half_work_reg =
+            vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+
+        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        TFLITE_DCHECK_EQ(scratch_data_offset % 8, 0);
+        vst1_s8(scratch_data_base + scratch_data_offset, half_work_reg);
+
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
+                      half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    } else {
+      TFLITE_DCHECK_EQ(width_overall_micro_repeats, 1);
+      const int copy_remaining = (copy_size + start_width) & 0x3;
+      padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining));
+      if (leading_width_padding) {
+        padding_mask = vset_lane_u8(255, padding_mask, 0);
+      }
+
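+      // Build the 8-byte register one lane at a time: bytes are loaded from
+      // the highest index down to 0, shifting left by 8 bits before each
+      // load, so that after the loop input byte j sits in lane j; the extra
+      // shift below then makes room for the leading padding byte in lane 0.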
+      for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+        for (int i = 0; i < copy_size; ++i) {
+          half_work_reg = vshl_n_u64(half_work_reg, 8);
+          half_work_reg = vld1_lane_s8(
+              reinterpret_cast<const int8*>(
+                  input_block_data + input_block_offset + copy_size - 1 - i),
+              half_work_reg, 0);
+        }
+        if (leading_width_padding) {
+          half_work_reg = vshl_n_s64(half_work_reg, 8);
+        }
+        half_work_reg =
+            vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg);
+
+        half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
+        TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset, half_work_reg,
+                      0);
+
+        // Trailing guard.
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
+                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
+                      half_work_reg, 0);
+
+        scratch_data_offset += workspace_height_stride;
+        input_block_offset += input_height_stride;
+      }
+    }
+
+    scratch_data_base += copy_block_height * workspace_height_stride;
+
+    if (trailing_height_padding) {
+      memset(scratch_data_base, -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+      scratch_data_base += workspace_height_stride;
+    }
+
+    TFLITE_DCHECK_EQ(
+        scratch_data_base,
+        scratch_block_data + block_height * workspace_height_stride);
+  }
+
   static inline void Run(int32 height_block_number, int32 width_block_number,
                          const uint8* input_block_data,
                          int8* scratch_block_data,
                          const DepthwiseConvDotProdParams* function_params) {
-    TFLITE_CHECK(false);  // TODO(b/127805639): Not yet implemented.
+#ifdef __aarch64__
+    PreloadInputBlock(input_block_data, function_params);
+#endif
+
+    PackMacroBlockIntrinsics(height_block_number, width_block_number,
+                             input_block_data, scratch_block_data,
+                             function_params);
   }
 };
 
@@ -1365,14 +2128,12 @@
           // without the reinterpret_cast. Sanitizers may fail silently on
           // lane-loading, with a obscure bug or mis-feature probably in
           // unhygienic macro expansion.
-          half_work_reg = vld1_lane_s32(
-              reinterpret_cast<const int32*>(input_block_data +
-                                             input_block_offset + copy_done),
-              half_work_reg, 0);
+          half_work_reg =
+              vld1_lane_8x4(input_block_data + input_block_offset + copy_done,
+                            half_work_reg, 0);
           half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
           TFLITE_DCHECK_EQ(copy_done % 4, 0);
-          vst1_lane_s32(reinterpret_cast<int32*>(scratch_data + copy_done),
-                        half_work_reg, 0);
+          vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
         }
 
         TFLITE_DCHECK_EQ(copy_remaining, copy_size - copy_done);
@@ -1388,9 +2149,8 @@
           // Employ overlapping-load strategy in order to load full register,
           // but use only part.
           // This has the advantage of resulting in zeros after shifting.
-          half_work_reg = vld1_lane_s32(
-              reinterpret_cast<const int32*>(
-                  input_block_data + input_block_offset + copy_size - 4),
+          half_work_reg = vld1_lane_8x4(
+              input_block_data + input_block_offset + copy_size - 4,
               half_work_reg, 0);
 
           half_work_reg =
@@ -1398,19 +2158,14 @@
 
           half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
           TFLITE_DCHECK_EQ(copy_done % 4, 0);
-          vst1_lane_s32(reinterpret_cast<int32*>(scratch_data + copy_done),
-                        half_work_reg, 0);
+          vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
           copy_done += 4;
         }
         // Trailing guard.
-        vst1_lane_s32(reinterpret_cast<int32*>(scratch_data + copy_done),
-                      half_work_reg, 0);
-        vst1_lane_s32(reinterpret_cast<int32*>(scratch_data + copy_done + 4),
-                      half_work_reg, 0);
-        vst1_lane_s32(reinterpret_cast<int32*>(scratch_data + copy_done + 8),
-                      half_work_reg, 0);
-        vst1_lane_s32(reinterpret_cast<int32*>(scratch_data + copy_done + 12),
-                      half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + copy_done, half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + copy_done + 4, half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + copy_done + 8, half_work_reg, 0);
+        vst1_lane_8x4(scratch_data + copy_done + 12, half_work_reg, 0);
 
         scratch_data_offset += workspace_height_stride;
         input_block_offset += input_height_stride;
@@ -1429,22 +2184,17 @@
 
         half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit));
         TFLITE_DCHECK_EQ(scratch_data_offset % 4, 0);
-        vst1_lane_s32(
-            reinterpret_cast<int32*>(scratch_data_base + scratch_data_offset),
-            half_work_reg, 0);
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset, half_work_reg,
+                      0);
 
         // Trailing guard.
-        vst1_lane_s32(reinterpret_cast<int32*>(scratch_data_base +
-                                               scratch_data_offset + 4),
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 4,
                       half_work_reg, 0);
-        vst1_lane_s32(reinterpret_cast<int32*>(scratch_data_base +
-                                               scratch_data_offset + 8),
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 8,
                       half_work_reg, 0);
-        vst1_lane_s32(reinterpret_cast<int32*>(scratch_data_base +
-                                               scratch_data_offset + 12),
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 12,
                       half_work_reg, 0);
-        vst1_lane_s32(reinterpret_cast<int32*>(scratch_data_base +
-                                               scratch_data_offset + 16),
+        vst1_lane_8x4(scratch_data_base + scratch_data_offset + 16,
                       half_work_reg, 0);
 
         scratch_data_offset += workspace_height_stride;
@@ -1463,7 +2213,7 @@
                          const uint8* input_block_data,
                          int8* scratch_block_data,
                          const DepthwiseConvDotProdParams* function_params) {
-#if defined(__aarch64__)
+#ifdef __aarch64__
     PreloadInputBlock(input_block_data, function_params);
 #endif
 
@@ -1810,17 +2560,399 @@
   }
 };
 
-#if defined(USE_NEON)
+// Beginning of code section containing intermediate code transformation.
+//
+// This section is only compiled when kUseUnwound3x3DotProduct versions of
+// templated functions are selected.
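+//
+// These kernels are a plain C++ rendering of the dot-product kernel: the
+// filter banks, left/right input banks and "simulated" shifts below mirror
+// the register layout and data movement of the NEON intrinsics versions that
+// follow, serving as an intermediate step between the reference code and the
+// intrinsics.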
+template <int32 stride>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kNoMultiplication,
+                        stride> {
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int input_width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int depth = function_params->input_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+    const int width_micro_stride = 4 * 8;
+    const int depth_micro_stride =
+        width_micro_stride * input_width_overall_micro_repeats;
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+
+    // Simulate NEON-register transposition of subset of filter.
+    int8 filter_bank_a_0[4][4];  // Depth 4, width 4.
+    int8 filter_bank_a_1[4][4];
+    int8 filter_bank_a_2[4][4];
+    int8 filter_bank_b_0[4][4];
+    int8 filter_bank_b_1[4][4];
+    int8 filter_bank_b_2[4][4];
+    // Simulate NEON-register input data concatenation + sub-selection.
+    // Also sub-block, height 3, depth 4, width 4.
+    uint8 output_values[4];  // Sub-block, depth 4.
+    // selected_data has format Depth 4, width 4.
+    int8 left_bank_0[4][4];
+    int8 left_bank_1[4][4];
+    int8 left_bank_2[4][4];
+    int8 right_bank_0[4][4];
+    int8 right_bank_1[4][4];
+    int8 right_bank_2[4][4];
+    memset(right_bank_0[0], 0, 16);
+    memset(right_bank_1[0], 0, 16);
+    memset(right_bank_2[0], 0, 16);
+
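+    // 96 bytes per depth micro block: 2 sub-blocks x 3 filter rows x 16 bytes
+    // (depth 4 x width 4), matching the six 16-byte memcpy calls below.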
+    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
+
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      const int8* filter_block =
+          filter_workspace + shuffled_filter_increment * j_depth;
+
+      memcpy(filter_bank_a_0, filter_block, 16);
+      memcpy(filter_bank_b_0, filter_block + 16, 16);
+      memcpy(filter_bank_a_1, filter_block + 32, 16);
+      memcpy(filter_bank_b_1, filter_block + 48, 16);
+      memcpy(filter_bank_a_2, filter_block + 64, 16);
+      memcpy(filter_bank_b_2, filter_block + 80, 16);
+
+      for (int s = 0; s < 2; ++s) {
+        // Work through one slice, by row, at a time.
+        for (int k_height = 0; k_height < block_height; ++k_height) {
+          const int8* scratch_data =
+              scratch_block_data +
+              workspace_height_stride * k_height * stride_val +
+              depth_micro_stride * j_depth;
+          uint8* output_data =
+              output_block_data + output_height_stride * k_height + 8 * j_depth;
+          const int8* input_data_0 = scratch_data + s * 2 * 8;
+
+          // Load first sub-micro block of data into operational banks.
+          memcpy(left_bank_0[0], input_data_0, 16);
+          memcpy(left_bank_1[0], input_data_0 + workspace_height_stride, 16);
+          memcpy(left_bank_2[0], input_data_0 + 2 * workspace_height_stride,
+                 16);
+
+          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+               ++i_width) {
+            const int output_width = i_width == output_width_micro_repeats
+                                         ? residual_width
+                                         : four_over_stride;
+            TFLITE_DCHECK_LE(output_width * stride_val, 4);
+            const int8* input_data =
+                input_data_0 + width_micro_stride * i_width;
+            const bool no_right_block = (output_width - 1) * stride_val < 2;
+
+            // Load next sub-micro block of data.
+            if (!no_right_block) {
+              memcpy(right_bank_0[0], input_data + width_micro_stride, 16);
+              memcpy(right_bank_1[0],
+                     input_data + workspace_height_stride + width_micro_stride,
+                     16);
+              memcpy(
+                  right_bank_2[0],
+                  input_data + 2 * workspace_height_stride + width_micro_stride,
+                  16);
+            }
+
+            // Iterate over input width shifts within 4x4 blocks.
+            for (int x = 0; x < output_width; ++x) {
+              // Operate on depth of 4 in batches.
+              for (int d = 0; d < 4; ++d) {
+                int32 acc = 0;
+                for (int x = 0; x < 4; ++x) {
+                  int32 input_val = left_bank_0[d][x];
+                  int32 filter_val = filter_bank_a_0[d][x];
+                  acc += filter_val * input_val;
+                }
+                for (int x = 0; x < 4; ++x) {
+                  int32 input_val = left_bank_1[d][x];
+                  int32 filter_val = filter_bank_a_1[d][x];
+                  acc += filter_val * input_val;
+                }
+                for (int x = 0; x < 4; ++x) {
+                  int32 input_val = left_bank_2[d][x];
+                  int32 filter_val = filter_bank_a_2[d][x];
+                  acc += filter_val * input_val;
+                }
+                acc += bias_data[d];
+                acc = reference_ops::depthwise_conv::DepthwiseConvRound<
+                    DepthwiseConvOutputRounding::kUpward>(
+                    acc, output_multiplier, output_shift);
+                acc += output_offset;
+                acc = std::max(acc, output_activation_min);
+                acc = std::min(acc, output_activation_max);
+                output_values[d] = static_cast<uint8>(acc);
+              }
+
+              for (int d = 0; d < 4; ++d) {
+                output_data[depth * (four_over_stride * i_width + x) + 4 * s +
+                            d] = output_values[d];
+              }
+
+              // Simulate shifting instructions.
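+              // For stride 1 the 4-wide window advances by one column,
+              // pulling the next column in from the right bank; for stride 2
+              // it advances by two columns.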
+              if (stride_val == 1) {
+                for (int depth_index = 0; depth_index < 4; ++depth_index) {
+                  for (int z = 0; z < 3; ++z) {
+                    left_bank_0[depth_index][z] =
+                        left_bank_0[depth_index][z + 1];
+                    left_bank_1[depth_index][z] =
+                        left_bank_1[depth_index][z + 1];
+                    left_bank_2[depth_index][z] =
+                        left_bank_2[depth_index][z + 1];
+                  }
+                  left_bank_0[depth_index][3] = right_bank_0[depth_index][0];
+                  left_bank_1[depth_index][3] = right_bank_1[depth_index][0];
+                  left_bank_2[depth_index][3] = right_bank_2[depth_index][0];
+                  for (int z = 0; z < 3; ++z) {
+                    right_bank_0[depth_index][z] =
+                        right_bank_0[depth_index][z + 1];
+                    right_bank_1[depth_index][z] =
+                        right_bank_1[depth_index][z + 1];
+                    right_bank_2[depth_index][z] =
+                        right_bank_2[depth_index][z + 1];
+                  }
+                }
+              } else {
+                for (int depth_index = 0; depth_index < 4; ++depth_index) {
+                  for (int z = 0; z < 2; ++z) {
+                    left_bank_0[depth_index][z] =
+                        left_bank_0[depth_index][z + 2];
+                    left_bank_1[depth_index][z] =
+                        left_bank_1[depth_index][z + 2];
+                    left_bank_2[depth_index][z] =
+                        left_bank_2[depth_index][z + 2];
+                  }
+                  left_bank_0[depth_index][2] = right_bank_0[depth_index][0];
+                  left_bank_1[depth_index][2] = right_bank_1[depth_index][0];
+                  left_bank_2[depth_index][2] = right_bank_2[depth_index][0];
+                  left_bank_0[depth_index][3] = right_bank_0[depth_index][1];
+                  left_bank_1[depth_index][3] = right_bank_1[depth_index][1];
+                  left_bank_2[depth_index][3] = right_bank_2[depth_index][1];
+                  for (int z = 0; z < 2; ++z) {
+                    right_bank_0[depth_index][z] =
+                        right_bank_0[depth_index][z + 2];
+                    right_bank_1[depth_index][z] =
+                        right_bank_1[depth_index][z + 2];
+                    right_bank_2[depth_index][z] =
+                        right_bank_2[depth_index][z + 2];
+                  }
+                }
+              }
+            }
+          }
+        }
+        bias_data += bias_increment;
+
+        // Move filter for second sub-block into operational filter.
+        for (int z = 0; z < 4; ++z) {
+          for (int x = 0; x < 4; ++x) {
+            filter_bank_a_0[z][x] = filter_bank_b_0[z][x];
+            filter_bank_a_1[z][x] = filter_bank_b_1[z][x];
+            filter_bank_a_2[z][x] = filter_bank_b_2[z][x];
+          }
+        }
+      }
+    }
+  }
+};
+
+template <int32 stride>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseUnwound3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                        stride> {
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int output_depth = function_params->output_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+
+    TFLITE_DCHECK_EQ(bias_increment, 4);
+
+    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
+
+    // Simulate NEON-register transposition of subset of filter.
+    int8 filter_bank_a_0[4][4];  // Depth 4, width 4.
+    int8 filter_bank_a_1[4][4];
+    int8 filter_bank_a_2[4][4];
+    int8 filter_bank_b_0[4][4];
+    int8 filter_bank_b_1[4][4];
+    int8 filter_bank_b_2[4][4];
+    // Simulate NEON-register input data concatenation + sub-selection.
+    // Also sub-block, height 3, depth 4, width 4.
+
+    int8 input_bank_0[8];
+    int8 input_bank_1[8];
+    int8 input_bank_2[8];
+
+    TFLITE_DCHECK_GE(depth_micro_repeats, 1);
+
+    uint8 output_values[2][4];  // Sub-block, depth 4.
+
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      memcpy(filter_bank_a_0, filter_workspace, 16);
+      memcpy(filter_bank_b_0, filter_workspace + 16, 16);
+      memcpy(filter_bank_a_1, filter_workspace + 32, 16);
+      memcpy(filter_bank_b_1, filter_workspace + 48, 16);
+      memcpy(filter_bank_a_2, filter_workspace + 64, 16);
+      memcpy(filter_bank_b_2, filter_workspace + 80, 16);
+
+      // Work through one slice, by row, at a time.
+      for (int k_height = 0; k_height < block_height; ++k_height) {
+        const int8* scratch_data =
+            scratch_block_data +
+            workspace_height_stride * k_height * stride_val;
+        uint8* output_data =
+            output_block_data + output_height_stride * k_height + 8 * j_depth;
+
+        memcpy(input_bank_0, scratch_data, 4);
+        memcpy(input_bank_1, scratch_data + workspace_height_stride, 4);
+        memcpy(input_bank_2, scratch_data + 2 * workspace_height_stride, 4);
+
+        for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+             ++i_width) {
+          const int output_width = i_width == output_width_micro_repeats
+                                       ? residual_width
+                                       : four_over_stride;
+
+          TFLITE_DCHECK_LE(output_width * stride_val, 4);
+          const int8* input_data = scratch_data + 4 * i_width;
+
+          memcpy(input_bank_0 + 4, input_data + 4, 4);
+          memcpy(input_bank_1 + 4, input_data + workspace_height_stride + 4, 4);
+          memcpy(input_bank_2 + 4, input_data + 2 * workspace_height_stride + 4,
+                 4);
+
+          // Iterate over input width shifts within 4x4 blocks.
+          for (int w = 0; w < output_width; ++w) {
+            constexpr int offset =
+                0;  // Shift the input banks instead of indexing with an
+                    // offset in the multiply-accumulate below.
+
+            {
+              const int s = 0;
+              for (int d = 0; d < 4; ++d) {
+                int32 acc = bias_data[s * 4 + d];
+                for (int x = 0; x < 4; ++x) {
+                  int32 input_val_0 = input_bank_0[offset + x];
+                  int32 filter_val_0 = filter_bank_a_0[d][x];
+                  acc += filter_val_0 * input_val_0;
+                  int32 input_val_1 = input_bank_1[offset + x];
+                  int32 filter_val_1 = filter_bank_a_1[d][x];
+                  acc += filter_val_1 * input_val_1;
+                  int32 input_val_2 = input_bank_2[offset + x];
+                  int32 filter_val_2 = filter_bank_a_2[d][x];
+                  acc += filter_val_2 * input_val_2;
+                }
+                acc = reference_ops::depthwise_conv::DepthwiseConvRound<
+                    DepthwiseConvOutputRounding::kUpward>(
+                    acc, output_multiplier, output_shift);
+                acc += output_offset;
+                acc = std::max(acc, output_activation_min);
+                acc = std::min(acc, output_activation_max);
+                output_values[s][d] = static_cast<uint8>(acc);
+
+                output_data[s * 4 + d] = output_values[s][d];
+              }
+            }
+            {
+              const int s = 1;
+              for (int d = 0; d < 4; ++d) {
+                int32 acc = bias_data[s * 4 + d];
+                for (int x = 0; x < 4; ++x) {
+                  int32 input_val_0 = input_bank_0[offset + x];
+                  int32 filter_val_0 = filter_bank_b_0[d][x];
+                  acc += filter_val_0 * input_val_0;
+                  int32 input_val_1 = input_bank_1[offset + x];
+                  int32 filter_val_1 = filter_bank_b_1[d][x];
+                  acc += filter_val_1 * input_val_1;
+                  int32 input_val_2 = input_bank_2[offset + x];
+                  int32 filter_val_2 = filter_bank_b_2[d][x];
+                  acc += filter_val_2 * input_val_2;
+                }
+                acc = reference_ops::depthwise_conv::DepthwiseConvRound<
+                    DepthwiseConvOutputRounding::kUpward>(
+                    acc, output_multiplier, output_shift);
+                acc += output_offset;
+                acc = std::max(acc, output_activation_min);
+                acc = std::min(acc, output_activation_max);
+                output_values[s][d] = static_cast<uint8>(acc);
+
+                output_data[s * 4 + d] = output_values[s][d];
+              }
+            }
+
+            // Simulate register shifts.
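+            // Sliding the 8-byte input banks down by stride_val bytes mirrors
+            // the vshrq_n_u64 shifts (by 8 * stride bits) in the intrinsics
+            // versions.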
+            for (int i = 0; i < (8 - stride_val); ++i) {
+              input_bank_0[i] = input_bank_0[i + stride_val];
+              input_bank_1[i] = input_bank_1[i + stride_val];
+              input_bank_2[i] = input_bank_2[i + stride_val];
+            }
+
+            output_data += output_depth;
+          }
+        }
+      }
+      bias_data += 2 * bias_increment;
+      filter_workspace += shuffled_filter_increment;
+    }
+  }
+};
+// The preceding section is only compiled when kUseUnwound3x3DotProduct versions
+// of templated functions are selected.
+//
+// End of code section containing intermediate code transformation.
+
+#ifdef USE_NEON
 template <>
 struct KernelMacroBlock<
     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
     DepthwiseConvDepthMultiplication::kNoMultiplication,
     /*stride=*/1> {
-  // Apply filter to macro block of input data and store results.
-  //
-  // Parameters for repeats and residual sizes are in terms of outputs.
-  //
-  // Requirement: depth_micro_repeats > 0 || residual_depth > 0.
   static inline void KernelMacroBlockIntrinsics(
       const int8* scratch_block_data, const int8* filter_workspace,
       const int32* bias_data, uint8* output_block_data,
@@ -1980,13 +3112,12 @@
               acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
               acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
 
-              vst1q_lane_u8x4(output_data, acc_u8_all, 0);
-              vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
-                              1);
-              vst1q_lane_u8x4(output_data + 2 * output_height_stride,
-                              acc_u8_all, 2);
-              vst1q_lane_u8x4(output_data + 3 * output_height_stride,
-                              acc_u8_all, 3);
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
 
               output_data += depth;
             }
@@ -2070,13 +3201,12 @@
               acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
               acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
 
-              vst1q_lane_u8x4(output_data, acc_u8_all, 0);
-              vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
-                              1);
-              vst1q_lane_u8x4(output_data + 2 * output_height_stride,
-                              acc_u8_all, 2);
-              vst1q_lane_u8x4(output_data + 3 * output_height_stride,
-                              acc_u8_all, 3);
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
 
               left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
               left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
@@ -2139,13 +3269,12 @@
               acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
               acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
 
-              vst1q_lane_u8x4(output_data, acc_u8_all, 0);
-              vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
-                              1);
-              vst1q_lane_u8x4(output_data + 2 * output_height_stride,
-                              acc_u8_all, 2);
-              vst1q_lane_u8x4(output_data + 3 * output_height_stride,
-                              acc_u8_all, 3);
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
 
               output_data += depth;
             }
@@ -2195,13 +3324,12 @@
               acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
               acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
 
-              vst1q_lane_u8x4(output_data, acc_u8_all, 0);
-              vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
-                              1);
-              vst1q_lane_u8x4(output_data + 2 * output_height_stride,
-                              acc_u8_all, 2);
-              vst1q_lane_u8x4(output_data + 3 * output_height_stride,
-                              acc_u8_all, 3);
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
 
               left_bank_0_reg = right_bank_0_reg;
               left_bank_1_reg = right_bank_1_reg;
@@ -2296,13 +3424,12 @@
               acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
               acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
 
-              vst1q_lane_u8x4(output_data, acc_u8_all, 0);
-              vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
-                              1);
-              vst1q_lane_u8x4(output_data + 2 * output_height_stride,
-                              acc_u8_all, 2);
-              vst1q_lane_u8x4(output_data + 3 * output_height_stride,
-                              acc_u8_all, 3);
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
 
               biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
               biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
@@ -2408,7 +3535,7 @@
                 acc_u8_0_0 =
                     vmin_u8(acc_u8_0_0, vget_low_u8(output_activation_max_vec));
 
-                vst1_lane_u8x4(output_data, acc_u8_0_0, 0);
+                vst1_lane_8x4(output_data, acc_u8_0_0, 0);
 
                 biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
                 biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
@@ -2600,9 +3727,8 @@
               acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
               acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
 
-              vst1_lane_u8x4(output_data_base, acc_u8, 0);
-              vst1_lane_u8x4(output_data_base + output_height_stride, acc_u8,
-                             1);
+              vst1_lane_8x4(output_data_base, acc_u8, 0);
+              vst1_lane_8x4(output_data_base + output_height_stride, acc_u8, 1);
 
               left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
               left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
@@ -2643,9 +3769,9 @@
               acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
               acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
 
-              vst1_lane_u8x4(output_data_base + depth, acc_u8, 0);
-              vst1_lane_u8x4(output_data_base + depth + output_height_stride,
-                             acc_u8, 1);
+              vst1_lane_8x4(output_data_base + depth, acc_u8, 0);
+              vst1_lane_8x4(output_data_base + depth + output_height_stride,
+                            acc_u8, 1);
 
               left_bank_0_reg = right_bank_0_reg;
               left_bank_1_reg = right_bank_1_reg;
@@ -2732,7 +3858,7 @@
               acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
               acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
 
-              vst1_lane_u8x4(output_data_base, acc_u8, 0);
+              vst1_lane_8x4(output_data_base, acc_u8, 0);
 
               left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
               left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
@@ -2762,7 +3888,7 @@
               acc_u8 = vmax_u8(acc_u8, output_activation_min_vec);
               acc_u8 = vmin_u8(acc_u8, output_activation_max_vec);
 
-              vst1_lane_u8x4(output_data_base + depth, acc_u8, 0);
+              vst1_lane_8x4(output_data_base + depth, acc_u8, 0);
 
               left_bank_0_reg = right_bank_0_reg;
               left_bank_1_reg = right_bank_1_reg;
@@ -2880,28 +4006,24 @@
           int8x16_t input_bank_c_reg;  //  left 4, right 4, left 5, right 5.
 
           // Load first sub-micro block of data into operational banks.
-          input_bank_a_reg = vld1q_dup_s32(reinterpret_cast<const int32*>(
-              next_input_data));  // Load lane 0, avoiding
-                                  // uninitialized variable.
           input_bank_a_reg =
-              vld1q_lane_s32(reinterpret_cast<const int32*>(
-                                 next_input_data + workspace_height_stride),
-                             input_bank_a_reg, 2);
-          input_bank_b_reg = vld1q_dup_s32(reinterpret_cast<const int32*>(
+              vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
+                                                // uninitialized variable.
+          input_bank_a_reg = vld1q_lane_8x4(
+              next_input_data + workspace_height_stride, input_bank_a_reg, 2);
+          input_bank_b_reg = vld1q_dup_s8x4(
               next_input_data +
-              2 * workspace_height_stride));  // Load lane 0, avoiding
-                                              // uninitialized variable.
+              2 * workspace_height_stride);  // Load lane 0, avoiding
+                                             // uninitialized variable.
           input_bank_b_reg =
-              vld1q_lane_s32(reinterpret_cast<const int32*>(
-                                 next_input_data + 3 * workspace_height_stride),
+              vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
                              input_bank_b_reg, 2);
-          input_bank_c_reg = vld1q_dup_s32(reinterpret_cast<const int32*>(
+          input_bank_c_reg = vld1q_dup_s8x4(
               next_input_data +
-              4 * workspace_height_stride));  // Load lane 0, avoiding
-                                              // uninitialized variable.
+              4 * workspace_height_stride);  // Load lane 0, avoiding
+                                             // uninitialized variable.
           input_bank_c_reg =
-              vld1q_lane_s32(reinterpret_cast<const int32*>(
-                                 next_input_data + 5 * workspace_height_stride),
+              vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
                              input_bank_c_reg, 2);
 
           int32x4_t acc0;
@@ -2968,40 +4090,32 @@
               acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
               acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
 
-              vst1q_lane_u8x4(output_data, acc_u8_all, 0);
-              vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
-                              1);
-              vst1q_lane_u8x4(output_data + 2 * output_height_stride,
-                              acc_u8_all, 2);
-              vst1q_lane_u8x4(output_data + 3 * output_height_stride,
-                              acc_u8_all, 3);
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
 
               output_data += output_depth;
             }
             // Load next sub-micro block of data.
             input_bank_a_reg =
-                vld1q_lane_s32(reinterpret_cast<const int32*>(next_input_data),
-                               input_bank_a_reg, 1);
-            input_bank_a_reg =
-                vld1q_lane_s32(reinterpret_cast<const int32*>(
-                                   next_input_data + workspace_height_stride),
-                               input_bank_a_reg, 3);
-            input_bank_b_reg = vld1q_lane_s32(
-                reinterpret_cast<const int32*>(next_input_data +
-                                               2 * workspace_height_stride),
-                input_bank_b_reg, 1);
-            input_bank_b_reg = vld1q_lane_s32(
-                reinterpret_cast<const int32*>(next_input_data +
-                                               3 * workspace_height_stride),
-                input_bank_b_reg, 3);
-            input_bank_c_reg = vld1q_lane_s32(
-                reinterpret_cast<const int32*>(next_input_data +
-                                               4 * workspace_height_stride),
-                input_bank_c_reg, 1);
-            input_bank_c_reg = vld1q_lane_s32(
-                reinterpret_cast<const int32*>(next_input_data +
-                                               5 * workspace_height_stride),
-                input_bank_c_reg, 3);
+                vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
+            input_bank_a_reg = vld1q_lane_8x4(
+                next_input_data + workspace_height_stride, input_bank_a_reg, 3);
+            input_bank_b_reg =
+                vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
+                               input_bank_b_reg, 1);
+            input_bank_b_reg =
+                vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
+                               input_bank_b_reg, 3);
+            input_bank_c_reg =
+                vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
+                               input_bank_c_reg, 1);
+            input_bank_c_reg =
+                vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
+                               input_bank_c_reg, 3);
 
             {
               acc0 = adjusted_bias_data;
@@ -3060,13 +4174,12 @@
               acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
               acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
 
-              vst1q_lane_u8x4(output_data, acc_u8_all, 0);
-              vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
-                              1);
-              vst1q_lane_u8x4(output_data + 2 * output_height_stride,
-                              acc_u8_all, 2);
-              vst1q_lane_u8x4(output_data + 3 * output_height_stride,
-                              acc_u8_all, 3);
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
 
               input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
               input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
@@ -3132,13 +4245,12 @@
               acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
               acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
 
-              vst1q_lane_u8x4(output_data, acc_u8_all, 0);
-              vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
-                              1);
-              vst1q_lane_u8x4(output_data + 2 * output_height_stride,
-                              acc_u8_all, 2);
-              vst1q_lane_u8x4(output_data + 3 * output_height_stride,
-                              acc_u8_all, 3);
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
 
               output_data += output_depth;
             }
@@ -3200,13 +4312,12 @@
               acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
               acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
 
-              vst1q_lane_u8x4(output_data, acc_u8_all, 0);
-              vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
-                              1);
-              vst1q_lane_u8x4(output_data + 2 * output_height_stride,
-                              acc_u8_all, 2);
-              vst1q_lane_u8x4(output_data + 3 * output_height_stride,
-                              acc_u8_all, 3);
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
 
               input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
               input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
@@ -3235,28 +4346,21 @@
 
             // Load next sub-micro block of data.
             input_bank_a_reg =
-                vld1q_lane_s32(reinterpret_cast<const int32*>(next_input_data),
-                               input_bank_a_reg, 1);
-            input_bank_a_reg =
-                vld1q_lane_s32(reinterpret_cast<const int32*>(
-                                   next_input_data + workspace_height_stride),
-                               input_bank_a_reg, 3);
-            input_bank_b_reg = vld1q_lane_s32(
-                reinterpret_cast<const int32*>(next_input_data +
-                                               2 * workspace_height_stride),
-                input_bank_b_reg, 1);
-            input_bank_b_reg = vld1q_lane_s32(
-                reinterpret_cast<const int32*>(next_input_data +
-                                               3 * workspace_height_stride),
-                input_bank_b_reg, 3);
-            input_bank_c_reg = vld1q_lane_s32(
-                reinterpret_cast<const int32*>(next_input_data +
-                                               4 * workspace_height_stride),
-                input_bank_c_reg, 1);
-            input_bank_c_reg = vld1q_lane_s32(
-                reinterpret_cast<const int32*>(next_input_data +
-                                               5 * workspace_height_stride),
-                input_bank_c_reg, 3);
+                vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
+            input_bank_a_reg = vld1q_lane_8x4(
+                next_input_data + workspace_height_stride, input_bank_a_reg, 3);
+            input_bank_b_reg =
+                vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
+                               input_bank_b_reg, 1);
+            input_bank_b_reg =
+                vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
+                               input_bank_b_reg, 3);
+            input_bank_c_reg =
+                vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
+                               input_bank_c_reg, 1);
+            input_bank_c_reg =
+                vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
+                               input_bank_c_reg, 3);
 
             // Iterate over input width shifts within 4x4 blocks.
             for (int x = 0; x < output_width; ++x) {
@@ -3303,13 +4407,12 @@
               acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
               acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
 
-              vst1q_lane_u8x4(output_data, acc_u8_all, 0);
-              vst1q_lane_u8x4(output_data + output_height_stride, acc_u8_all,
-                              1);
-              vst1q_lane_u8x4(output_data + 2 * output_height_stride,
-                              acc_u8_all, 2);
-              vst1q_lane_u8x4(output_data + 3 * output_height_stride,
-                              acc_u8_all, 3);
+              vst1q_lane_8x4(output_data, acc_u8_all, 0);
+              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
+              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
+                             2);
+              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
+                             3);
 
               input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 8);
               input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 8);
@@ -3363,17 +4466,15 @@
             int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
 
             // Load first sub-micro block of data into operational banks.
-            input_bank_a_reg = vld1q_dup_s32(reinterpret_cast<const int32*>(
-                next_input_data));  // Load lane 0, avoiding uninitialized
-                                    // variable.
             input_bank_a_reg =
-                vld1q_lane_s32(reinterpret_cast<const int32*>(
-                                   next_input_data + workspace_height_stride),
-                               input_bank_a_reg, 2);
-            input_bank_b_reg = vld1q_dup_s32(reinterpret_cast<const int32*>(
+                vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
+                                                  // uninitialized variable.
+            input_bank_a_reg = vld1q_lane_8x4(
+                next_input_data + workspace_height_stride, input_bank_a_reg, 2);
+            input_bank_b_reg = vld1q_dup_s8x4(
                 next_input_data +
-                2 * workspace_height_stride));  // Load lane 0, avoiding
-                                                // uninitialized variable.
+                2 * workspace_height_stride);  // Load lane 0, avoiding
+                                               // uninitialized variable.
 
             for (int i_width = 0; i_width < output_width_overall_micro_repeats;
                  ++i_width) {
@@ -3382,17 +4483,14 @@
                   i_width == output_width_micro_repeats ? residual_width : 4;
 
               // Load next sub-micro block of data.
-              input_bank_a_reg = vld1q_lane_s32(
-                  reinterpret_cast<const int32*>(next_input_data),
-                  input_bank_a_reg, 1);
               input_bank_a_reg =
-                  vld1q_lane_s32(reinterpret_cast<const int32*>(
-                                     next_input_data + workspace_height_stride),
+                  vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
+              input_bank_a_reg =
+                  vld1q_lane_8x4(next_input_data + workspace_height_stride,
                                  input_bank_a_reg, 3);
-              input_bank_b_reg = vld1q_lane_s32(
-                  reinterpret_cast<const int32*>(next_input_data +
-                                                 2 * workspace_height_stride),
-                  input_bank_b_reg, 1);
+              input_bank_b_reg =
+                  vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
+                                 input_bank_b_reg, 1);
               // Iterate over input width shifts within 4x4 blocks.
               for (int x = 0; x < output_width; ++x) {
                 int32x4_t acc = adjusted_bias_data;
@@ -3420,9 +4518,7 @@
                 acc_u8_0_0 =
                     vmin_u8(acc_u8_0_0, vget_low_u8(output_activation_max_vec));
 
-                // if (x==1) {
-                vst1_lane_u8x4(output_data, acc_u8_0_0, 0);
-                // }
+                vst1_lane_8x4(output_data, acc_u8_0_0, 0);
 
                 input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 8);
                 input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 8);
@@ -3457,19 +4553,450 @@
     DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct,
     DepthwiseConvDepthMultiplication::kUnitInputDepth,
     /*stride=*/2> {
+  static inline void KernelMacroBlockIntrinsics(
+      const int8* scratch_block_data, const int8* filter_workspace,
+      const int32* bias_data, uint8* output_block_data,
+      const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int output_depth = function_params->output_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    const int bias_increment = function_params->bias_increment;
+
+    const int32 output_activation_min =
+        function_params->quantized_activation_min;
+    const int32 output_activation_max =
+        function_params->quantized_activation_max;
+    const int32 output_multiplier = function_params->output_multiplier;
+    const int32 output_shift = function_params->output_shift;
+    const int32 output_offset = function_params->output_offset;
+    TFLITE_DCHECK_GE(output_activation_min, 0);
+    TFLITE_DCHECK_LT(output_activation_min, 256);
+    TFLITE_DCHECK_GE(output_activation_max, 0);
+    TFLITE_DCHECK_LT(output_activation_max, 256);
+    TFLITE_DCHECK_GE(output_offset, -32768);
+    TFLITE_DCHECK_LT(output_offset, 32768);
+
+    TFLITE_DCHECK_GE(depth_micro_repeats, 1);
+    TFLITE_DCHECK_EQ(bias_increment, 4);
+
+    const int16x8_t output_offset_vec =
+        vdupq_n_s16(static_cast<int16>(output_offset));
+    const uint8x16_t output_activation_min_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_min));
+    const uint8x16_t output_activation_max_vec =
+        vdupq_n_u8(static_cast<uint8>(output_activation_max));
+
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      int8x16_t filter_reg_0_a;
+      int8x16_t filter_reg_0_b;
+      int8x16_t filter_reg_1_a;
+      int8x16_t filter_reg_1_b;
+      int8x16_t filter_reg_2_a;
+      int8x16_t filter_reg_2_b;
+
+      filter_reg_0_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_0_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_1_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_a = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+      filter_reg_2_b = vld1q_s8(filter_workspace);
+      filter_workspace += 16;
+
+      TFLITE_DCHECK_EQ(bias_increment, 4);
+      const int32x4_t adjusted_bias_data_s_0 = vld1q_s32(bias_data);
+      bias_data += bias_increment;
+      const int32x4_t adjusted_bias_data_s_1 = vld1q_s32(bias_data);
+      bias_data += bias_increment;
+
+      if (block_height == 2) {
+        const int8* scratch_data = scratch_block_data;
+        uint8* output_data = output_block_data + 8 * j_depth;
+
+        int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
+        int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
+        int8x16_t input_bank_c_reg;  //  left 4, right 4, xxx, xxx.
+
+        // Load first sub-micro block of data into operational banks.
+        input_bank_a_reg =
+            vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+        input_bank_a_reg = vld1q_lane_8x4(
+            scratch_data + workspace_height_stride, input_bank_a_reg, 2);
+        input_bank_b_reg = vld1q_dup_s8x4(
+            scratch_data +
+            2 * workspace_height_stride);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+        input_bank_b_reg = vld1q_lane_8x4(
+            scratch_data + 3 * workspace_height_stride, input_bank_b_reg, 2);
+        input_bank_c_reg = vld1q_dup_s8x4(
+            scratch_data +
+            4 * workspace_height_stride);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+
+        int32x4_t acc0;
+        int32x4_t acc1;
+
+        for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+             ++i_width) {
+          const int output_width = i_width == output_width_micro_repeats
+                                       ? residual_width
+                                       : four_over_stride;
+
+          TFLITE_DCHECK_LE(output_width, 2);
+          TFLITE_DCHECK_GE(output_width, 1);
+          TFLITE_DCHECK_LE(output_width * stride_val, 4);
+          const int8* input_data = scratch_data + 4 + 4 * i_width;
+
+          // Load next sub-micro block of data.
+          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
+          input_bank_a_reg = vld1q_lane_8x4(
+              input_data + workspace_height_stride, input_bank_a_reg, 3);
+          input_bank_b_reg = vld1q_lane_8x4(
+              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
+          input_bank_b_reg = vld1q_lane_8x4(
+              input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
+          input_bank_c_reg = vld1q_lane_8x4(
+              input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);
+
+          int16x8_t acc_s16_0_1;
+          uint8x8_t acc_u8_0_1;
+          // Iterate over input width shifts within 4x4 blocks.
+          {
+            acc0 = adjusted_bias_data_s_0;
+            acc1 = adjusted_bias_data_s_0;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            vst1_lane_8x4(output_data, acc_u8_0_1, 0);
+            vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
+
+            acc0 = adjusted_bias_data_s_1;
+            acc1 = adjusted_bias_data_s_1;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
+            vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
+                          1);
+
+            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+            input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
+
+            output_data += output_depth;
+          }
+          if (output_width == 2) {
+            acc0 = adjusted_bias_data_s_0;
+            acc1 = adjusted_bias_data_s_0;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            vst1_lane_8x4(output_data, acc_u8_0_1, 0);
+            vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);
+
+            acc0 = adjusted_bias_data_s_1;
+            acc1 = adjusted_bias_data_s_1;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);
+
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
+            vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
+                          1);
+
+            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+            input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);
+
+            output_data += output_depth;
+          }
+        }
+      } else {
+        TFLITE_DCHECK_EQ(block_height, 1);
+        // Work through one slice, by row, at a time.
+        const int8* scratch_data = scratch_block_data;
+        uint8* output_data = output_block_data + 8 * j_depth;
+
+        //
+        int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
+        int8x16_t input_bank_b_reg;  //  left 2, right 2, xxx, xxx.
+
+        // Load first sub-micro block of data into operational banks.
+        input_bank_a_reg =
+            vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+        input_bank_a_reg = vld1q_lane_8x4(
+            scratch_data + workspace_height_stride, input_bank_a_reg, 2);
+        input_bank_b_reg = vld1q_dup_s8x4(
+            scratch_data +
+            2 * workspace_height_stride);  // Load lane 0, avoiding
+                                           // uninitialized variable.
+
+        int32x4_t acc0;
+        int32x4_t acc1;
+
+        for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+             ++i_width) {
+          const int output_width =
+              i_width == output_width_micro_repeats ? residual_width : 2;
+
+          TFLITE_DCHECK_LE(output_width, 2);
+          TFLITE_DCHECK_GE(output_width, 1);
+          TFLITE_DCHECK_LE(output_width * stride_val, 4);
+          const int8* input_data = scratch_data + 4 + 4 * i_width;
+
+          // Load next sub-micro block of data.
+          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
+          input_bank_a_reg = vld1q_lane_8x4(
+              input_data + workspace_height_stride, input_bank_a_reg, 3);
+          input_bank_b_reg = vld1q_lane_8x4(
+              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
+
+          int16x8_t acc_s16_0_1;
+          uint8x8_t acc_u8_0_1;
+
+          // Iterate over input width shifts within 4x4 blocks.
+          {
+            acc0 = adjusted_bias_data_s_0;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+
+            // Second sub-block accumulation.
+            acc1 = adjusted_bias_data_s_1;
+
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
+
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            // This stores the results for both sub-blocks together.
+            vst1_u8(output_data, acc_u8_0_1);
+
+            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+
+            output_data += output_depth;
+          }
+          if (output_width == 2) {
+            acc0 = adjusted_bias_data_s_0;
+
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
+            acc0 =
+                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
+
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc0, -output_shift);
+
+            // Second sub-block accumulation.
+            acc1 = adjusted_bias_data_s_1;
+
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
+            acc1 =
+                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);
+
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
+                acc1, -output_shift);
+
+            // Add the output offset.
+            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
+            // Apply the activation function.
+            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
+            acc_u8_0_1 =
+                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
+            acc_u8_0_1 =
+                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));
+
+            // This stores the results for both sub-blocks together.
+            vst1_u8(output_data, acc_u8_0_1);
+
+            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
+            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
+
+            output_data += output_depth;
+          }
+        }
+      }
+    }
+  }
+
   static inline void Run(const int8* scratch_block_data,
                          const int8* filter_workspace, const int32* bias_data,
                          uint8* output_block_data,
                          const DepthwiseConvDotProdParams* function_params) {
-    TFLITE_CHECK(false);  // TODO(b/127805639): Not yet implemented.
+    KernelMacroBlockIntrinsics(scratch_block_data, filter_workspace, bias_data,
+                               output_block_data, function_params);
   }
 };
 
-#undef vst1_lane_u8x4
-#undef vst1q_lane_u8x4
+#undef vst1_lane_8x4
+#undef vst1q_lane_8x4
 #undef vld1q_lane_s8x8
+#undef vld1_lane_8x4
+#undef vld1q_lane_8x4
+#undef vld1q_dup_s8x4
 
-#endif
+#endif  //  USE_NEON
 
 }  // namespace depthwise_conv
 }  // namespace optimized_ops
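
The kernels added above all finish with the same requantization sequence on
their int32 accumulators: a saturating rounding doubling high multiply by
output_multiplier (vqrdmulhq_n_s32), a rounding right shift by -output_shift
(DivideByPOT<kUpward>), narrowing with addition of output_offset, and clamping
to [quantized_activation_min, quantized_activation_max]. Below is a minimal
scalar sketch of that per-lane pipeline, for orientation only; the helper names
are illustrative, and the vector code's intermediate int16 saturating-narrow
steps are folded into plain int32 arithmetic.

  #include <algorithm>
  #include <cstdint>
  #include <limits>

  // Scalar equivalent of vqrdmulhq_n_s32 for a single lane.
  inline std::int32_t SaturatingRoundingDoublingHighMul(std::int32_t a,
                                                        std::int32_t b) {
    const bool overflow =
        a == b && a == std::numeric_limits<std::int32_t>::min();
    const std::int64_t ab =
        static_cast<std::int64_t>(a) * static_cast<std::int64_t>(b);
    const std::int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
    const std::int32_t high =
        static_cast<std::int32_t>((ab + nudge) / (1ll << 31));
    return overflow ? std::numeric_limits<std::int32_t>::max() : high;
  }

  // Per-lane sketch of the requantization applied to each accumulator above.
  inline std::uint8_t RequantizeLane(std::int32_t acc, std::int32_t multiplier,
                                     std::int32_t output_shift,  // assumed <= 0
                                     std::int32_t offset, std::int32_t act_min,
                                     std::int32_t act_max) {
    acc = SaturatingRoundingDoublingHighMul(acc, multiplier);
    // Rounding right shift by -output_shift, as in DivideByPOT<kUpward>.
    const int shift = -output_shift;
    if (shift > 0) {
      acc = (acc + (std::int32_t{1} << (shift - 1))) >> shift;
    }
    acc += offset;                 // output_offset
    acc = std::max(acc, act_min);  // quantized_activation_min
    acc = std::min(acc, act_max);  // quantized_activation_max
    return static_cast<std::uint8_t>(acc);
  }
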
diff --git a/tensorflow/lite/kernels/internal/optimized/eigen_spatial_convolutions.h b/tensorflow/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
index 3e71d6c..87ef4ec 100644
--- a/tensorflow/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
+++ b/tensorflow/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
@@ -13,9 +13,6 @@
 limitations under the License.
 ==============================================================================*/
 
-// Copied from tensorflow/core/kernels/eigen_spatial_convolutions.h.
-// TODO(petewarden) - move this to a common location in Eigen itself.
-
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
 
@@ -39,1482 +36,6 @@
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #endif
 
-namespace Eigen {
-
-namespace internal {
-
-// WARNING: Most of the code here implicitly assumes that the matrix is in
-// ColMajor layout. This is guaranteed by the tensor contraction (see
-// TensorContraction.h).
-//
-// Inside Eigen a tensor contraction is represented by a matrix multiplication.
-// We don't want to actually extract image patches and reshape the result into
-// a matrix (this involves allocating huge extra memory), so the patch
-// extraction and reshape operations are implicit.
-//
-// TensorContractionInputMapper takes a matrix index and returns the coefficient
-// (or the packet) of the "virtual tensor", that would be at that index if we
-// were to actually reshape the result of patch extraction.
-//
-// TensorContractionSubMapper provides a similar view into the "virtual matrix"
-// at the given vertical and horizontal offsets.
-//
-// "Virtual matrix" dimensions:
-//   *0: kernelChannels * kernelRows * kernelCols;
-//    1: out_height * out_width; * OTHERS (e.g batches, etc...)
-//
-// *) extracted patches are continuous in memory (innermost dimension assuming
-//    col major layout)
-//
-// With this dimensions:
-//   row - offset within a single patch (in code: patchId)
-//   col - index of the extracted patch (in code: patchIndex)
-//         patchIndex ∈ [0..num_patches * OTHERS] (batch and other dimensions)
-//
-// TODO(ezhulenev): Consolidate this part of the code with the image patch
-// extraction code since they are both very similar.
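
As a concrete illustration of this mapping (assumed example values, matching
the arithmetic in loadCoeffStandard below): with patch_depth = 3,
patch_rows = 2 and patch_cols = 2, each "virtual matrix" column has
3 * 2 * 2 = 12 rows, and row index patchId = 7 resolves to

  patchOffset = 7 / 3     = 2   // position within the patch, in depth units
  colOffset   = 2 / 2     = 1   // patch column
  rowOffset   = 2 - 1 * 2 = 0   // patch row
  depth       = 7 - 2 * 3 = 1   // channel within that patch element

i.e. channel 1 of the patch element at (row 0, col 1), while the matrix
column index (patchIndex) selects which extracted patch is read.
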
-
-template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
-          typename Device, typename Scalar_, typename Index,
-          typename nocontract_t, typename contract_t, int Side, int packet_size,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
-class TensorContractionInputMapper<
-    Scalar_, Index, Side,
-    TensorEvaluator<
-        const TensorReshapingOp<NewDimension,
-                                const TensorImagePatchOp<Rows, Cols, ArgType> >,
-        Device>,
-    nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-    inner_dim_reordered, Alignment> {
- public:
-  typedef Scalar_ Scalar;
-
-  typedef TensorContractionInputMapper<
-      Scalar, Index, Side,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>
-      Self;
-
-  typedef TensorContractionSubMapper<
-      Scalar, Index, Side,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>
-      SubMapper;
-
-  typedef SubMapper VectorMapper;
-  typedef SubMapper LinearMapper;
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  EIGEN_DEVICE_FUNC
-  TensorContractionInputMapper(
-      const TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>& tensor,
-      const nocontract_t&, const nocontract_t&, const contract_t&,
-      const contract_t&)
-      : m_impl(tensor.impl().impl()) {
-    Index patch_rows;
-    Index patch_depth;
-    if (internal::traits<ArgType>::Layout == ColMajor) {
-      patch_depth = tensor.impl().dimensions()[0];
-      patch_rows = tensor.impl().dimensions()[1];
-      m_patch_cols = tensor.impl().dimensions()[2];
-      m_num_patches = tensor.impl().dimensions()[3];
-    } else {
-      const size_t NumDims = tensor.impl().dimensions().size();
-      patch_depth = tensor.impl().dimensions()[NumDims - 1];
-      patch_rows = tensor.impl().dimensions()[NumDims - 2];
-      m_patch_cols = tensor.impl().dimensions()[NumDims - 3];
-      m_num_patches = tensor.impl().dimensions()[NumDims - 4];
-    }
-
-    // Strides for navigating through the single patch.
-    m_patch_row_stride = patch_depth;
-    m_patch_col_stride = patch_rows * m_patch_row_stride;
-
-    m_patch_row_inflate_strides = tensor.impl().rowInflateStride();
-    m_patch_col_inflate_strides = tensor.impl().colInflateStride();
-
-    m_colStride = patch_rows;
-
-    m_outputRows = tensor.impl().outputRows();
-    m_row_strides = tensor.impl().userRowStride();
-    m_col_strides = tensor.impl().userColStride();
-
-    m_in_row_strides = tensor.impl().userInRowStride();
-    m_in_col_strides = tensor.impl().userInColStride();
-
-    if (internal::traits<ArgType>::Layout == ColMajor) {
-      m_inputRows = tensor.impl().impl().dimensions()[1];
-      m_inputCols = tensor.impl().impl().dimensions()[2];
-    } else {
-      const int NumDims = tensor.impl().impl().dimensions().size();
-      m_inputRows = tensor.impl().impl().dimensions()[NumDims - 2];
-      m_inputCols = tensor.impl().impl().dimensions()[NumDims - 3];
-    }
-
-    m_rowInputStride = patch_depth;
-    m_colInputStride = patch_depth * m_inputRows;
-    m_patchInputStride = patch_depth * m_inputRows * m_inputCols;
-
-    m_rowPaddingTop = tensor.impl().rowPaddingTop();
-    m_colPaddingLeft = tensor.impl().colPaddingLeft();
-
-    m_fastPatchRowStride =
-        internal::TensorIntDivisor<Index>(m_patch_row_stride);
-    m_fastPatchColStride =
-        internal::TensorIntDivisor<Index>(m_patch_col_stride);
-    m_fastInputRowStride =
-        internal::TensorIntDivisor<Index>(m_patch_row_inflate_strides);
-    m_fastInputColStride =
-        internal::TensorIntDivisor<Index>(m_patch_col_inflate_strides);
-    m_fastNumPatches = internal::TensorIntDivisor<Index>(m_num_patches);
-    m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
-    m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows);
-    m_fastDimZero = internal::TensorIntDivisor<Index>(patch_depth);
-  }
-
-  EIGEN_DEVICE_FUNC
-  TensorContractionInputMapper(const TensorContractionInputMapper& base_mapper)
-      : m_impl(base_mapper.m_impl) {
-    m_patch_cols = base_mapper.m_patch_cols;
-    m_num_patches = base_mapper.m_num_patches;
-
-    m_patch_row_stride = base_mapper.m_patch_row_stride;
-    m_patch_col_stride = base_mapper.m_patch_col_stride;
-
-    m_patch_row_inflate_strides = base_mapper.m_patch_row_inflate_strides;
-    m_patch_col_inflate_strides = base_mapper.m_patch_col_inflate_strides;
-
-    m_colStride = base_mapper.m_colStride;
-
-    m_rowInputStride = base_mapper.m_rowInputStride;
-    m_colInputStride = base_mapper.m_colInputStride;
-    m_patchInputStride = base_mapper.m_patchInputStride;
-
-    m_inputRows = base_mapper.m_inputRows;
-    m_inputCols = base_mapper.m_inputCols;
-
-    m_outputRows = base_mapper.m_outputRows;
-    m_row_strides = base_mapper.m_row_strides;
-    m_col_strides = base_mapper.m_col_strides;
-
-    m_in_row_strides = base_mapper.m_in_row_strides;
-    m_in_col_strides = base_mapper.m_in_col_strides;
-
-    m_rowPaddingTop = base_mapper.m_rowPaddingTop;
-    m_colPaddingLeft = base_mapper.m_colPaddingLeft;
-
-    m_fastPatchRowStride = base_mapper.m_fastPatchRowStride;
-    m_fastPatchColStride = base_mapper.m_fastPatchColStride;
-    m_fastInputRowStride = base_mapper.m_fastInputRowStride;
-    m_fastInputColStride = base_mapper.m_fastInputColStride;
-    m_fastNumPatches = base_mapper.m_fastNumPatches;
-    m_fastColStride = base_mapper.m_fastColStride;
-    m_fastOutputRows = base_mapper.m_fastOutputRows;
-    m_fastDimZero = base_mapper.m_fastDimZero;
-  }
-
-  // If true, turns off some optimizations for loading packets since the image
-  // patches are "non-standard" such as there are non-trivial strides or
-  // inflations in the input.
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool nonStandardPatches() const {
-    return m_in_row_strides != 1 || m_in_col_strides != 1 ||
-           m_patch_row_inflate_strides != 1 || m_patch_col_inflate_strides != 1;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
-    return SubMapper(*this, i, j);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
-    return LinearMapper(*this, i, j);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Scalar operator()(Index row) const {
-    Index rowIndex, colIndex, otherIndex;
-    computeBaseIndices(0, rowIndex, colIndex, otherIndex);
-    return loadCoeff(row, rowIndex, colIndex, otherIndex);
-  }
-
-  // Load the coefficient at the patchIndex location instead of the usual
-  // m_rowIndex,
-  // m_colIndex, m_otherIndex. This is currently only used by the gpu code.
-  // EIGEN_DEVICE_FUNC
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Scalar operator()(Index row, Index patchIndex) const {
-    Index rowIndex, colIndex, otherIndex;
-    computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex);
-    return loadCoeff(row, rowIndex, colIndex, otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacket(Index row) const {
-    Index rowIndex, colIndex, otherIndex;
-    computeBaseIndices(0, rowIndex, colIndex, otherIndex);
-    return loadPacket(row, rowIndex, colIndex, otherIndex);
-  }
-
-  // Load the packet at the patchIndex location instead of the usual m_rowIndex,
-  // m_colIndex, m_otherIndex. This is currently only used by the gpu code.
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacket(Index row, Index patchIndex) const {
-    Index rowIndex, colIndex, otherIndex;
-    computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex);
-    return loadPacket(row, rowIndex, colIndex, otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE const TensorEvaluator<ArgType, Device>& impl() const {
-    return m_impl;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_rowInputStride; }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchRows() const { return m_colStride; }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; }
-
- private:
-  friend class TensorContractionSubMapper<
-      Scalar, Index, Side,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>;
-
-  // Load coefficient from a patch specified by the "within patch offset"
-  // (patchId) and the precomputed indices of the first element of the patch.
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Scalar loadCoeff(Index patchId, Index rowIndex,
-                                       Index colIndex, Index otherIndex) const {
-    // Find the offset of the element wrt the location of the first element.
-    const Index patchOffset = patchId / m_fastDimZero;
-
-    const Index colOffset = patchOffset / m_fastColStride;
-    const Index inputCol = colIndex + colOffset * m_in_col_strides;
-    const Index origInputCol =
-        (m_patch_col_inflate_strides == 1)
-            ? inputCol
-            : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
-
-    const Index rowOffset = patchOffset - colOffset * m_colStride;
-    const Index inputRow = rowIndex + rowOffset * m_in_row_strides;
-    const Index origInputRow =
-        (m_patch_row_inflate_strides == 1)
-            ? inputRow
-            : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
-    if (origInputCol < 0 || origInputRow < 0 || origInputCol >= m_inputCols ||
-        origInputRow >= m_inputRows ||
-        (inputCol != origInputCol * m_patch_col_inflate_strides) ||
-        (inputRow != origInputRow * m_patch_row_inflate_strides)) {
-      return Scalar(0);
-    }
-    const Index depth = patchId - patchOffset * patchDepth();
-    const Index inputIndex = depth + origInputRow * m_rowInputStride +
-                             origInputCol * m_colInputStride + otherIndex;
-    return m_impl.coeff(inputIndex);
-  }
-
-  // This is the same as loadCoeff(...), but optimized for all `inflate_strides`
-  // and `in_strides` equal to 1 (template specialization without templates).
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Scalar loadCoeffStandard(Index patchId, Index rowIndex,
-                                               Index colIndex,
-                                               Index otherIndex) const {
-    eigen_assert(!nonStandardPatches());
-
-    // Find the offset of the element wrt the location of the first element.
-    const Index patchOffset = patchId / m_fastDimZero;
-    const Index colOffset = patchOffset / m_fastColStride;
-    const Index rowOffset = patchOffset - colOffset * m_colStride;
-    const Index inputCol = colIndex + colOffset;
-    const Index inputRow = rowIndex + rowOffset;
-    if (inputCol < 0 || inputCol >= m_inputCols || inputRow < 0 ||
-        inputRow >= m_inputRows) {
-      return Scalar(0);
-    }
-    const Index depth = patchId - patchOffset * patchDepth();
-    const Index inputIndex = depth + inputRow * m_rowInputStride +
-                             inputCol * m_colInputStride + otherIndex;
-    return m_impl.coeff(inputIndex);
-  }
-
-  // Load packet from a patch specified by the "within patch offset"
-  // (patchId) and the precomputed indices of the first element of the patch.
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacket(Index patchId, Index rowIndex,
-                                        Index colIndex,
-                                        Index otherIndex) const {
-    const Index packetSize = internal::unpacket_traits<Packet>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
-
-    if (nonStandardPatches()) {
-      return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
-    }
-    return loadPacketStandard(patchId, rowIndex, colIndex, otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index patchId, Index rowIndex,
-                                                Index colIndex,
-                                                Index otherIndex) const {
-    const Index packetSize = internal::unpacket_traits<Packet>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
-
-    eigen_assert(!nonStandardPatches());
-
-    if ((patchDepth() % packetSize) == 0) {
-      return loadPacketFast(patchId, rowIndex, colIndex, otherIndex);
-    } else {
-      // Offsets and input calculation here are identical to
-      // loadCoeffStandard(...), but repeated twice.
-
-      const Index patchOffsets[2] = {
-          patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero};
-
-      const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride,
-                                   patchOffsets[1] / m_fastColStride};
-      const Index inputCols[2] = {colIndex + colOffsets[0],
-                                  colIndex + colOffsets[1]};
-      if (inputCols[0] >= m_inputCols || inputCols[1] < 0) {
-        // all zeros
-        return internal::pset1<Packet>(Scalar(0));
-      }
-
-      if (inputCols[0] == inputCols[1]) {
-        const Index rowOffsets[2] = {
-            patchOffsets[0] - colOffsets[0] * m_colStride,
-            patchOffsets[1] - colOffsets[1] * m_colStride};
-        eigen_assert(rowOffsets[0] <= rowOffsets[1]);
-        const Index inputRows[2] = {rowIndex + rowOffsets[0],
-                                    rowIndex + rowOffsets[1]};
-
-        if (inputRows[0] >= m_inputRows || inputRows[1] < 0) {
-          // all zeros
-          return internal::pset1<Packet>(Scalar(0));
-        }
-
-        if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) {
-          // no padding
-          const Index depth = patchId - patchOffsets[0] * patchDepth();
-          const Index inputIndex = depth + inputRows[0] * m_rowInputStride +
-                                   inputCols[0] * m_colInputStride + otherIndex;
-          return m_impl.template packet<Unaligned>(inputIndex);
-        }
-      }
-    }
-    return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index patchId, Index rowIndex,
-                                            Index colIndex,
-                                            Index otherIndex) const {
-    const Index packetSize = internal::unpacket_traits<Packet>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
-
-    eigen_assert(!nonStandardPatches());
-    eigen_assert((patchDepth() % packetSize) == 0);
-    // Find the offset of the element wrt the location of the first element.
-    const Index patchOffset = patchId / m_fastDimZero;
-    eigen_assert((patchId + packetSize - 1) / m_fastDimZero == patchOffset);
-
-    const Index colOffset = patchOffset / m_fastColStride;
-    const Index rowOffset = patchOffset - colOffset * m_colStride;
-    const Index inputCol = colIndex + colOffset;
-    const Index inputRow = rowIndex + rowOffset;
-    if (inputCol < 0 || inputRow < 0 || inputCol >= m_inputCols ||
-        inputRow >= m_inputRows) {
-      // all zeros
-      return internal::pset1<Packet>(Scalar(0));
-    }
-    // no padding
-    const Index depth = patchId - patchOffset * patchDepth();
-    const Index inputIndex = depth + inputRow * m_rowInputStride +
-                             inputCol * m_colInputStride + otherIndex;
-    return m_impl.template packet<Unaligned>(inputIndex);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet packetWithPossibleZero(
-      Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const {
-    const int packetSize = internal::unpacket_traits<Packet>::size;
-    EIGEN_ALIGN_MAX
-    typename internal::remove_const<Scalar>::type values[packetSize];
-    for (int i = 0; i < packetSize; ++i) {
-      values[i] = loadCoeff(patchId + i, rowIndex, colIndex, otherIndex);
-    }
-    Packet rslt = internal::pload<Packet>(values);
-    return rslt;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void computeBaseIndices(
-      Index patchIndex, Index& rowIndex, Index& colIndex,
-      Index& otherIndex) const {
-    const size_t NumInputDims = array_size<
-        typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
-    otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches;
-    const Index patch2DIndex = (NumInputDims == 3)
-                                   ? patchIndex
-                                   : (patchIndex - otherIndex * m_num_patches);
-    otherIndex *= m_patchInputStride;
-    colIndex = patch2DIndex / m_fastOutputRows;
-    rowIndex = patch2DIndex - colIndex * m_outputRows;
-    colIndex = colIndex * m_col_strides - m_colPaddingLeft;
-    rowIndex = rowIndex * m_row_strides - m_rowPaddingTop;
-  }
-
-  Index m_patch_cols;   // number of columns in the patch
-  Index m_num_patches;  // number of patches to extract.
-
-  // Strides for navigating through the single patch.
-  Index m_patch_row_stride;
-  Index m_patch_col_stride;
-  internal::TensorIntDivisor<Index> m_fastPatchRowStride;
-  internal::TensorIntDivisor<Index> m_fastPatchColStride;
-
-  Index m_patch_row_inflate_strides;  // the strides for row inflation in the
-                                      // image patch
-  Index m_patch_col_inflate_strides;  // the strides for col inflation in the
-                                      // image patch
-  // Fast representation of inflation strides.
-  internal::TensorIntDivisor<Index> m_fastInputRowStride;
-  internal::TensorIntDivisor<Index> m_fastInputColStride;
-
-  Index m_otherStride;
-  Index m_colStride;
-  internal::TensorIntDivisor<Index> m_fastNumPatches;
-  internal::TensorIntDivisor<Index> m_fastColStride;
-
-  Index m_rowInputStride;    // row stride in the input tensor
-  Index m_colInputStride;    // col stride in the input tensor
-  Index m_patchInputStride;  // patch stride in the input tensor
-
-  Index m_inputRows;  // Number of rows in the input tensor
-  Index m_inputCols;  // Number of cols in the input tensor
-
-  Index m_outputRows;  // Number of patch rows
-
-  Index m_row_strides;  // User specified row stride
-  Index m_col_strides;  // User specified col stride
-
-  Index m_in_row_strides;  // User specified input row stride
-  Index m_in_col_strides;  // User specified input col stride
-
-  Index m_rowPaddingTop;   // Row padding
-  Index m_colPaddingLeft;  // Column padding
-
-  internal::TensorIntDivisor<Index> m_fastOutputRows;
-  internal::TensorIntDivisor<Index> m_fastDimZero;
-
-  const TensorEvaluator<ArgType, Device> m_impl;
-};
-
-template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
-          typename Device, typename Scalar, typename Index,
-          typename nocontract_t, typename contract_t, int Side, int packet_size,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
-class TensorContractionSubMapper<
-    Scalar, Index, Side,
-    TensorEvaluator<
-        const TensorReshapingOp<NewDimension,
-                                const TensorImagePatchOp<Rows, Cols, ArgType> >,
-        Device>,
-    nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-    inner_dim_reordered, Alignment> {
- public:
-  typedef typename packet_traits<Scalar>::type Packet;
-  typedef typename packet_traits<Scalar>::half HalfPacket;
-
-  typedef TensorContractionInputMapper<
-      Scalar, Index, Side,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>
-      ParentMapper;
-
-  typedef TensorContractionSubMapper<
-      Scalar, Index, Side,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>
-      Self;
-
-  typedef Self LinearMapper;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(
-      const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
-      : m_depth_offset(vert_offset),
-        m_col_offset(horiz_offset),
-        m_base_mapper(base_mapper) {
-    m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex,
-                                     m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(
-      const Self& base_mapper, Index vert_offset, Index horiz_offset)
-      : m_depth_offset(vert_offset + base_mapper.m_depth_offset),
-        m_col_offset(horiz_offset + base_mapper.m_col_offset),
-        m_base_mapper(base_mapper.m_base_mapper) {
-    m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex,
-                                     m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
-    return m_base_mapper.loadCoeff(i + m_depth_offset, m_rowIndex, m_colIndex,
-                                   m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i,
-                                                          Index j) const {
-    return m_base_mapper(i + m_depth_offset, j + m_col_offset);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
-    return m_base_mapper.loadPacket(i + m_depth_offset, m_rowIndex, m_colIndex,
-                                    m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i,
-                                                          Index j) const {
-    return m_base_mapper.template loadPacket<Alignment>(i + m_depth_offset,
-                                                        j + m_col_offset);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar
-  loadCoeffStandard(Index i) const {
-    return m_base_mapper.loadCoeffStandard(i + m_depth_offset, m_rowIndex,
-                                           m_colIndex, m_otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index i) const {
-    return m_base_mapper.loadPacketFast(i + m_depth_offset, m_rowIndex,
-                                        m_colIndex, m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet
-  loadPacketStandard(Index i) const {
-    return m_base_mapper.loadPacketStandard(i + m_depth_offset, m_rowIndex,
-                                            m_colIndex, m_otherIndex);
-  }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC bool aligned(Index) const {
-    return false;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool nonStandardPatches() const {
-    return m_base_mapper.nonStandardPatches();
-  }
-
-  // Max(Col|Row|Depth): compute the upper limit for the column, row and depth
-  // index respectively that fits into the peeled_k elements starting at
-  // m_depth_offset.
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const {
-    const Index max_col =
-        (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) /
-        fastPatchColStride();
-    return std::min<Index>(1 + max_col, patchCols());
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k,
-                                   const Index col) const {
-    const Index max_row = (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) -
-                           col * patchColStride()) /
-                          fastPatchRowStride();
-    return std::min<Index>(1 + max_row, patchRows());
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index maxDepth(const Index peeled_k, const Index col,
-                                     Index row) const {
-    const Index max_depth = m_depth_offset + peeled_k -  //
-                            col * patchColStride() -     //
-                            row * patchRowStride();
-    return std::min<Index>(max_depth, patchDepth());
-  }
-
-  // MaxDepth uses only the remaining number of elements in the peeled_k.
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index maxDepth(const Index num_elements,
-                                     const Index start_depth) const {
-    return std::min<Index>(start_depth + num_elements, patchDepth());
-  }
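
For a concrete sense of these bounds (assumed example values): with
patchDepth() = 8, patchRows() = patchCols() = 3, m_depth_offset = 0 and
peeled_k = 16, patchColStride() = 3 * 8 = 24 and patchRowStride() = 8, so

  maxCol(16)         = min(1 + 15 / 24, 3)     = 1
  maxRow(16, 0)      = min(1 + 15 / 8, 3)      = 2
  maxDepth(16, 0, 1) = min(0 + 16 - 0 - 8, 8)  = 8

that is, the 16 peeled elements cover exactly rows 0 and 1 of patch column 0,
each at the full depth of 8.
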
-
-  // Every register matters in this code, so sometimes to prevent register
-  // spilling, instead of the variable that you would expect to see, we use
-  // another one, that is guaranteed to have the same value. E.g. patch depth is
-  // always the same as input depth, and it's also the same as input row stride.
-  // Bunch of other parameters have similar relations.
-
-  typedef internal::TensorIntDivisor<Index> IndexDivisor;
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchDepth() const {
-    return m_base_mapper.m_rowInputStride;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchRows() const {
-    return m_base_mapper.m_colStride;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchCols() const {
-    return m_base_mapper.m_patch_cols;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchRowStride() const {
-    eigen_assert(patchDepth() == m_base_mapper.m_patch_row_stride &&
-                 "Patch depth must be equal to patch row stride.");
-    return patchDepth();
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchColStride() const {
-    return m_base_mapper.m_patch_col_stride;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE IndexDivisor fastPatchRowStride() const {
-    eigen_assert(patchDepth() == m_base_mapper.m_patch_row_stride &&
-                 "Patch depth must be equal to patch row stride.");
-    return m_base_mapper.m_fastDimZero;  // patch_depth
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE IndexDivisor fastPatchColStride() const {
-    return m_base_mapper.m_fastPatchColStride;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth,
-                                             const Index baseIndex) const {
-    const Index inputIndex = depth + baseIndex;
-    return m_base_mapper.m_impl.template packet<Unaligned>(inputIndex);
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Scalar coeffNoPadding(const Index depth,
-                                            const Index baseIndex) const {
-    const Index inputIndex = depth + baseIndex;
-    return m_base_mapper.m_impl.coeff(inputIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool padRow(const Index row) const {
-    const Index r = m_rowIndex + row;
-    return r < 0 || r >= m_base_mapper.m_inputRows;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool padAnyRow(const Index first_row,
-                                     const Index last_row) const {
-    return m_rowIndex + first_row < 0 ||
-           m_rowIndex + last_row >= m_base_mapper.m_inputRows;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool padCol(const Index col) const {
-    const Index c = m_colIndex + col;
-    return c < 0 || c >= m_base_mapper.m_inputCols;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index baseIndex(const Index row, const Index col) const {
-    const Index r = m_rowIndex + row;
-    const Index c = m_colIndex + col;
-    return r * m_base_mapper.m_rowInputStride +
-           c * m_base_mapper.m_colInputStride + m_otherIndex;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index rowStride() const {
-    return m_base_mapper.m_row_strides;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index colStride() const {
-    return m_base_mapper.m_col_strides;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index rowOffset() const {
-    const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
-    const Index colOffset = patchOffset / m_base_mapper.m_fastColStride;
-    return patchOffset - colOffset * m_base_mapper.m_colStride;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index colOffset() const {
-    const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
-    const Index colOffset = patchOffset / m_base_mapper.m_fastColStride;
-    return colOffset;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index depthOffset() const {
-    return m_depth_offset % patchDepth();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper
-  getLinearMapper(Index i, Index j) const {
-    return LinearMapper(m_base_mapper, i + m_depth_offset, j + m_col_offset);
-  }
-
- private:
-  Index m_depth_offset;  // First row in the input matrix
-  Index m_col_offset;    // First col in the input matrix
-
-  // Knowing that: col_offset == patchIndex * OTHERS, we keep precomputed base
-  // indices for the first element in a patch specified by col_offset
-  // (see computeBaseIndices(...) for details).
-  Index m_rowIndex;
-  Index m_colIndex;
-  Index m_otherIndex;
-
-  const ParentMapper m_base_mapper;  // Keeping a copy instead of a reference
-                                     // performs better in benchmarks.
-};
-
-// Arrange a block of the right input matrix (in our case it's always a "virtual
-// matrix" constructed from extracted image patches) in contiguous memory.
-//
-// Given column major input (A0 beside A1 in memory):
-// A0 B0 C0 D0  E0 F0 G0 H0 ... Z0
-// A1 B1 C1 D1  E1 F1 G1 H1 ... Z1
-// A2 B2 C2 D2  E2 F2 G2 H2 ... Z2
-// A3 B3 C3 D3  E3 F3 G3 H3 ... Z3
-// A4 B4 C4 D4  E4 F4 G4 H4 ... Z4
-// A5 B5 C5 D5  E5 F5 G5 H5 ... Z5
-// A6 B6 C6 D6  E6 F6 G6 H6 ... Z6
-// A7 B7 C7 D7  E7 F7 G7 H7 ... Z7
-// A8 ...
-// ...
-//
-// *) A, B, C, ... - patches extracted from the original input.
-// *) A0, A1, A2 ... - values from the same patch at different offsets.
-//
-// The traversal (packed rhs memory) order (B0 besides A0 in memory):
-// A0 B0 C0 D0 A1 B1 C1 D1 ...
-// E0 F0 G0 H0 E1 F1 G1 H1 ...
-// ...
-// Z0 Z1 Z2 Z3 Z4 Z5 Z6 Z7 ... <- doesn't belong to any block (nr = 4)
-//
-// This traversal order must be the same as in default gemm_pack_rhs defined in
-// GeneralBlockPanelKernel.h.
-//
-// *) nr - number of registers along the 'n' dimension.
-//    See GeneralBlockPanelKernel.h and "Anatomy of High-Performance Matrix
-//    Multiplication" paper.
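
As an assumed numeric walkthrough of the packing loop that follows: with
depth = 10, packet_size = 4 and cols = 6, packet_cols4 = 4 and peeled_k = 8,
so

  - columns 0..3 are packed four at a time, with packet loads and ptranspose
    covering k = 0..7;
  - the remaining two coefficients (k = 8, 9) of each of those columns are
    copied by the scalar tail loops;
  - columns 4 and 5 fall through to the final one-column-at-a-time loop.
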
-template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
-          typename Device, typename Scalar, typename Index,
-          typename nocontract_t, typename contract_t, int packet_size,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
-          int nr>
-struct gemm_pack_rhs<
-    Scalar, Index,
-    TensorContractionSubMapper<
-        Scalar, Index, Rhs,
-        TensorEvaluator<
-            const TensorReshapingOp<
-                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-            Device>,
-        nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-        inner_dim_reordered, Alignment>,
-    nr, ColMajor, false, false> {
-  typedef TensorContractionSubMapper<
-      Scalar, Index, Rhs,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
-      inner_dim_reordered, Alignment>
-      SubMapper;
-  typedef SubMapper DataMapper;
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE)
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
-                                    Index depth, Index cols, Index stride = 0,
-                                    Index offset = 0) const {
-    eigen_assert(stride == 0);
-    eigen_assert(offset == 0);
-
-    const Index packet_cols4 = (cols / 4) * 4;
-    const Index peeled_k = (depth / packet_size) * packet_size;
-    const bool non_standard_patches = rhs.nonStandardPatches();
-
-    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
-      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
-      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
-      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
-
-      Index k = 0;
-      if ((packet_size % 4) == 0 && !non_standard_patches) {
-        // FAST PATH:
-        // Iterate over patch columns and rows, if we know that a single
-        // packet do not span across multiple rows or columns.
-        if ((rhs.patchDepth() % packet_size) == 0) {
-          const Index start_col = rhs.colOffset();
-          const Index max_col = rhs.maxCol(peeled_k);
-
-          for (Index c = start_col; c < max_col; ++c) {
-            eigen_assert(k <= peeled_k);
-
-            const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
-            const Index max_row = rhs.maxRow(peeled_k, c);
-
-            const bool pad_col0 = dm0.padCol(c);
-            const bool pad_col1 = dm1.padCol(c);
-            const bool pad_col2 = dm2.padCol(c);
-            const bool pad_col3 = dm3.padCol(c);
-
-            // Check if we can squeeze reads along the `row` and `depth`
-            // dimensions (two innermost dimensions).
-            if (!pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
-                !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&  //
-                !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&  //
-                !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&  //
-                !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) {
-              // Compute how many elements we can squeeze read.
-              const Index start_depth =
-                  (c == start_col) ? rhs.depthOffset() : 0;
-
-              // Upper bound for the number of elements in the depth dimension
-              // that we can squeeze read.
-              const Index squeeze_length =
-                  (max_row - start_row) * rhs.patchDepth() - start_depth;
-
-              // Do not overshoot beyond the block size.
-              const Index max_depth =
-                  start_depth + std::min<Index>(peeled_k - k, squeeze_length);
-              eigen_assert((max_depth - start_depth) % packet_size == 0);
-
-              const Index idx0 = dm0.baseIndex(start_row, c);
-              const Index idx1 = dm1.baseIndex(start_row, c);
-              const Index idx2 = dm2.baseIndex(start_row, c);
-              const Index idx3 = dm3.baseIndex(start_row, c);
-
-              for (Index d = start_depth; d < max_depth; d += packet_size) {
-                eigen_assert(k < peeled_k);
-                PacketBlock<Packet, 4> kernel;
-                kernel.packet[0] = rhs.packetNoPadding(d, idx0);
-                kernel.packet[1] = rhs.packetNoPadding(d, idx1);
-                kernel.packet[2] = rhs.packetNoPadding(d, idx2);
-                kernel.packet[3] = rhs.packetNoPadding(d, idx3);
-                ptranspose(kernel);
-                pstoreu(block + 0 * packet_size, kernel.packet[0]);
-                pstoreu(block + 1 * packet_size, kernel.packet[1]);
-                pstoreu(block + 2 * packet_size, kernel.packet[2]);
-                pstoreu(block + 3 * packet_size, kernel.packet[3]);
-                block += 4 * packet_size;
-                k += packet_size;
-              }
-
-              // Go to the next column.
-              continue;
-            }
-
-            // If we can't squeeze reads, process rows one by one.
-            for (Index r = start_row; r < max_row; ++r) {
-              eigen_assert(k <= peeled_k);
-
-              const bool pad0 = pad_col0 || dm0.padRow(r);
-              const bool pad1 = pad_col1 || dm1.padRow(r);
-              const bool pad2 = pad_col2 || dm2.padRow(r);
-              const bool pad3 = pad_col3 || dm3.padRow(r);
-
-              const Index idx0 = dm0.baseIndex(r, c);
-              const Index idx1 = dm1.baseIndex(r, c);
-              const Index idx2 = dm2.baseIndex(r, c);
-              const Index idx3 = dm3.baseIndex(r, c);
-
-              const Index start_depth = ((c == start_col) && (r == start_row))
-                                            ? rhs.depthOffset()
-                                            : 0;
-              const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
-              eigen_assert((max_depth - start_depth) % packet_size == 0);
-
-              for (Index d = start_depth; d < max_depth; d += packet_size) {
-                eigen_assert(k < peeled_k);
-                PacketBlock<Packet, 4> kernel;
-                kernel.packet[0] = pad0 ? pset1<Packet>(Scalar(0))
-                                        : rhs.packetNoPadding(d, idx0);
-                kernel.packet[1] = pad1 ? pset1<Packet>(Scalar(0))
-                                        : rhs.packetNoPadding(d, idx1);
-                kernel.packet[2] = pad2 ? pset1<Packet>(Scalar(0))
-                                        : rhs.packetNoPadding(d, idx2);
-                kernel.packet[3] = pad3 ? pset1<Packet>(Scalar(0))
-                                        : rhs.packetNoPadding(d, idx3);
-                ptranspose(kernel);
-                pstoreu(block + 0 * packet_size, kernel.packet[0]);
-                pstoreu(block + 1 * packet_size, kernel.packet[1]);
-                pstoreu(block + 2 * packet_size, kernel.packet[2]);
-                pstoreu(block + 3 * packet_size, kernel.packet[3]);
-                block += 4 * packet_size;
-                k += packet_size;
-              }
-            }
-          }
-
-          // The loop above should fill peeled_k elements.
-          eigen_assert(peeled_k == k);
-
-        } else {
-          for (; k < peeled_k; k += packet_size) {
-            PacketBlock<Packet, 4> kernel;
-            kernel.packet[0] = dm0.loadPacketStandard(k);
-            kernel.packet[1] = dm1.loadPacketStandard(k);
-            kernel.packet[2] = dm2.loadPacketStandard(k);
-            kernel.packet[3] = dm3.loadPacketStandard(k);
-            ptranspose(kernel);
-            pstoreu(block + 0 * packet_size, kernel.packet[0]);
-            pstoreu(block + 1 * packet_size, kernel.packet[1]);
-            pstoreu(block + 2 * packet_size, kernel.packet[2]);
-            pstoreu(block + 3 * packet_size, kernel.packet[3]);
-            block += 4 * packet_size;
-          }
-        }
-      }
-
-      // Copy the remaining coefficients of the column block after the peeled_k.
-      if (!rhs.nonStandardPatches()) {
-        for (; k < depth; k++) {
-          block[0] = dm0.loadCoeffStandard(k);
-          block[1] = dm1.loadCoeffStandard(k);
-          block[2] = dm2.loadCoeffStandard(k);
-          block[3] = dm3.loadCoeffStandard(k);
-          block += 4;
-        }
-      } else {
-        for (; k < depth; k++) {
-          block[0] = dm0(k);
-          block[1] = dm1(k);
-          block[2] = dm2(k);
-          block[3] = dm3(k);
-          block += 4;
-        }
-      }
-    }
-
-    // Copy the remaining columns one at a time (nr==1).
-    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
-      for (Index k = 0; k < depth; k++) {
-        *block = dm0(k);
-        block += 1;
-      }
-    }
-  }
-};
-
-// Template specialization for packet_size = 2. We must special-case packet
-// blocks with nr > packet_size, e.g. PacketBlock<Packet2d, 4>.
-template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
-          typename Device, typename Scalar, typename Index,
-          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
-          bool inner_dim_reordered, int Alignment, int nr>
-struct gemm_pack_rhs<
-    Scalar, Index,
-    TensorContractionSubMapper<
-        Scalar, Index, Rhs,
-        TensorEvaluator<
-            const TensorReshapingOp<
-                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-            Device>,
-        nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
-        Alignment>,
-    nr, ColMajor, false, false> {
-  typedef TensorContractionSubMapper<
-      Scalar, Index, Rhs,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered,
-      Alignment>
-      SubMapper;
-  typedef SubMapper DataMapper;
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE)
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
-                                    Index depth, Index cols, Index stride = 0,
-                                    Index offset = 0) const {
-    eigen_assert(stride == 0);
-    eigen_assert(offset == 0);
-
-    const int packet_size = 2;
-    const Index packet_cols4 = (cols / 4) * 4;
-    const Index peeled_k = (depth / packet_size) * packet_size;
-    const bool non_standard_patches = rhs.nonStandardPatches();
-
-    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
-      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
-      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
-      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
-
-      Index k = 0;
-      if (!non_standard_patches) {
-        // FAST PATH:
-        // Iterate over patch columns and rows if we know that a single
-        // packet does not span across multiple rows or columns.
-        if ((rhs.patchDepth() % packet_size) == 0) {
-          const Index start_col = rhs.colOffset();
-          const Index max_col = rhs.maxCol(peeled_k);
-
-          for (Index c = start_col; c < max_col; ++c) {
-            eigen_assert(k <= peeled_k);
-
-            const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
-            const Index max_row = rhs.maxRow(peeled_k, c);
-
-            const bool pad_col0 = dm0.padCol(c);
-            const bool pad_col1 = dm1.padCol(c);
-            const bool pad_col2 = dm2.padCol(c);
-            const bool pad_col3 = dm3.padCol(c);
-
-            // We can squeeze reads along the `row` and `depth` dimensions if
-            // the row stride is `1`, which means that `row` and `depth`
-            // dimensions are contiguous (two innermost dimensions).
-            if (rhs.rowStride() == 1 &&                                //
-                !pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
-                !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&  //
-                !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&  //
-                !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&  //
-                !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) {
-              // Compute how many elements we can squeeze read.
-              const Index start_depth =
-                  (c == start_col) ? rhs.depthOffset() : 0;
-
-              // Upper bound for the number of elements in the depth dimension
-              // that we can squeeze read.
-              const Index squeeze_length =
-                  (max_row - start_row) * rhs.patchDepth() - start_depth;
-
-              // Do not overshoot beyond the block size.
-              const Index max_depth =
-                  start_depth + std::min<Index>(peeled_k - k, squeeze_length);
-              eigen_assert((max_depth - start_depth) % packet_size == 0);
-
-              const Index idx0 = dm0.baseIndex(start_row, c);
-              const Index idx1 = dm1.baseIndex(start_row, c);
-              const Index idx2 = dm2.baseIndex(start_row, c);
-              const Index idx3 = dm3.baseIndex(start_row, c);
-
-              for (Index d = start_depth; d < max_depth; d += packet_size) {
-                PacketBlock<Packet, 2> kernel0;
-                PacketBlock<Packet, 2> kernel1;
-                kernel0.packet[0] = rhs.packetNoPadding(d, idx0);
-                kernel0.packet[1] = rhs.packetNoPadding(d, idx1);
-                kernel1.packet[0] = rhs.packetNoPadding(d, idx2);
-                kernel1.packet[1] = rhs.packetNoPadding(d, idx3);
-                ptranspose(kernel0);
-                ptranspose(kernel1);
-                pstoreu(block + 0 * packet_size, kernel0.packet[0]);
-                pstoreu(block + 1 * packet_size, kernel1.packet[0]);
-                pstoreu(block + 2 * packet_size, kernel0.packet[1]);
-                pstoreu(block + 3 * packet_size, kernel1.packet[1]);
-                block += 4 * packet_size;
-                k += packet_size;
-              }
-
-              // Go to the next column.
-              continue;
-            }
-
-            // If we can't squeeze reads, process rows one by one.
-            for (Index r = start_row; r < max_row; ++r) {
-              eigen_assert(k <= peeled_k);
-
-              const bool pad0 = pad_col0 || dm0.padRow(r);
-              const bool pad1 = pad_col1 || dm1.padRow(r);
-              const bool pad2 = pad_col2 || dm2.padRow(r);
-              const bool pad3 = pad_col3 || dm3.padRow(r);
-
-              const Index idx0 = dm0.baseIndex(r, c);
-              const Index idx1 = dm1.baseIndex(r, c);
-              const Index idx2 = dm2.baseIndex(r, c);
-              const Index idx3 = dm3.baseIndex(r, c);
-
-              const Index start_depth = ((c == start_col) && (r == start_row))
-                                            ? rhs.depthOffset()
-                                            : 0;
-              const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
-              eigen_assert((max_depth - start_depth) % packet_size == 0);
-
-              for (Index d = start_depth; d < max_depth; d += packet_size) {
-                eigen_assert(k < peeled_k);
-                PacketBlock<Packet, 2> kernel0;
-                PacketBlock<Packet, 2> kernel1;
-                kernel0.packet[0] = pad0 ? pset1<Packet>(Scalar(0))
-                                         : rhs.packetNoPadding(d, idx0);
-                kernel0.packet[1] = pad1 ? pset1<Packet>(Scalar(0))
-                                         : rhs.packetNoPadding(d, idx1);
-                kernel1.packet[0] = pad2 ? pset1<Packet>(Scalar(0))
-                                         : rhs.packetNoPadding(d, idx2);
-                kernel1.packet[1] = pad3 ? pset1<Packet>(Scalar(0))
-                                         : rhs.packetNoPadding(d, idx3);
-                ptranspose(kernel0);
-                ptranspose(kernel1);
-                pstoreu(block + 0 * packet_size, kernel0.packet[0]);
-                pstoreu(block + 1 * packet_size, kernel1.packet[0]);
-                pstoreu(block + 2 * packet_size, kernel0.packet[1]);
-                pstoreu(block + 3 * packet_size, kernel1.packet[1]);
-                block += 4 * packet_size;
-                k += packet_size;
-              }
-            }
-          }
-
-          // The loop above should fill peeled_k elements.
-          eigen_assert(peeled_k == k);
-
-        } else {
-          // A packet can span multiple rows or columns, so we have to go
-          // through the slower "standard" path.
-          for (; k < peeled_k; k += packet_size) {
-            PacketBlock<Packet, 2> kernel0;
-            PacketBlock<Packet, 2> kernel1;
-            kernel0.packet[0] = dm0.loadPacketStandard(k);
-            kernel0.packet[1] = dm1.loadPacketStandard(k);
-            kernel1.packet[0] = dm2.loadPacketStandard(k);
-            kernel1.packet[1] = dm3.loadPacketStandard(k);
-            ptranspose(kernel0);
-            ptranspose(kernel1);
-            pstoreu(block + 0 * packet_size, kernel0.packet[0]);
-            pstoreu(block + 1 * packet_size, kernel1.packet[0]);
-            pstoreu(block + 2 * packet_size, kernel0.packet[1]);
-            pstoreu(block + 3 * packet_size, kernel1.packet[1]);
-            block += 4 * packet_size;
-          }
-        }
-      }
-
-      // Copy the remaining coefficients of the column block after the peeled_k.
-      if (!non_standard_patches) {
-        for (; k < depth; k++) {
-          block[0] = dm0.loadCoeffStandard(k);
-          block[1] = dm1.loadCoeffStandard(k);
-          block[2] = dm2.loadCoeffStandard(k);
-          block[3] = dm3.loadCoeffStandard(k);
-          block += 4;
-        }
-      } else {
-        for (; k < depth; k++) {
-          block[0] = dm0(k);
-          block[1] = dm1(k);
-          block[2] = dm2(k);
-          block[3] = dm3(k);
-          block += 4;
-        }
-      }
-    }
-
-    // Copy the remaining columns one at a time (nr==1).
-    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
-      for (Index k = 0; k < depth; k++) {
-        *block = dm0(k);
-        block += 1;
-      }
-    }
-  }
-};
-
-// Special case for non-vectorized types such as float16.
-template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
-          typename Device, typename Scalar, typename Index,
-          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
-          bool inner_dim_reordered, int Alignment, int nr>
-struct gemm_pack_rhs<
-    Scalar, Index,
-    TensorContractionSubMapper<
-        Scalar, Index, Rhs,
-        TensorEvaluator<
-            const TensorReshapingOp<
-                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-            Device>,
-        nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered,
-        Alignment>,
-    nr, ColMajor, false, false> {
-  typedef TensorContractionSubMapper<
-      Scalar, Index, Rhs,
-      TensorEvaluator<
-          const TensorReshapingOp<
-              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
-          Device>,
-      nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered,
-      Alignment>
-      SubMapper;
-  typedef SubMapper DataMapper;
-
-  EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE)
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs,
-                                    Index depth, Index cols, Index stride = 0,
-                                    Index offset = 0) const {
-    eigen_assert(stride == 0);
-    eigen_assert(offset == 0);
-
-    const Index packet_cols4 = (cols / 4) * 4;
-
-    for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
-      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
-      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
-      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
-
-      if (!rhs.nonStandardPatches()) {
-        for (Index k = 0; k < depth; k++) {
-          block[0] = dm0.loadCoeffStandard(k);
-          block[1] = dm1.loadCoeffStandard(k);
-          block[2] = dm2.loadCoeffStandard(k);
-          block[3] = dm3.loadCoeffStandard(k);
-          block += 4;
-        }
-      } else {
-        for (Index k = 0; k < depth; k++) {
-          block[0] = dm0(k);
-          block[1] = dm1(k);
-          block[2] = dm2(k);
-          block[3] = dm3(k);
-          block += 4;
-        }
-      }
-    }
-
-    // Copy the remaining columns one at a time (nr==1).
-    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
-      for (Index k = 0; k < depth; k++) {
-        *block = dm0(k);
-        block += 1;
-      }
-    }
-  }
-};
-}  // end namespace internal
-
-/** SpatialConvolution
- * \ingroup CXX11_NeuralNetworks_Module
- *
- * \brief Applies a 2D convolution over a multichannel input image.
- *
- * The input parameter is expected to be a tensor with a rank of 3 or more
- * (channels, height, width, and optionally others).
- * The kernel parameter is expected to be a 4D tensor (filters, channels,
- * kernel_height, kernel_width).
- * The input and the kernel must both be in col-major layout. The result will
- * also be in col-major layout.
- *
- * If col_in_stride or row_in_stride is greater than 1, the convolution is
- * applied with holes (aka atrous convolution), sampling every col_in_stride,
- * row_in_stride input pixels.
- *
- * The result can be assigned to a tensor of rank equal to the rank of the
- * input. The dimensions of the result will be filters, height, width (and
- * others if applicable).
- *
- * It is possible to swap the order of the width and height dimensions provided
- * that the same order is used in the input, the kernel, and the output.
- *
- * It is also possible to add an output kernel to the contraction; the output
- * kernel is called by Eigen when it "finalizes" a block of the output tensor.
- *
- */
-template <typename Input, typename Kernel,
-          typename OutputKernel = const NoOpOutputKernel>
-EIGEN_DEVICE_FUNC
-    EIGEN_ALWAYS_INLINE static const typename internal::conditional<
-        internal::traits<Input>::Layout == ColMajor,
-        TensorReshapingOp<
-            const DSizes<typename internal::traits<Input>::Index,
-                         internal::traits<Input>::NumDimensions>,
-            const TensorContractionOp<
-                const array<IndexPair<typename internal::traits<Input>::Index>,
-                            1>,
-                const TensorReshapingOp<
-                    const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const Kernel>,
-                const TensorReshapingOp<
-                    const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const TensorImagePatchOp<Dynamic, Dynamic, const Input> >,
-                const OutputKernel> >,
-        TensorReshapingOp<
-            const DSizes<typename internal::traits<Input>::Index,
-                         internal::traits<Input>::NumDimensions>,
-            const TensorContractionOp<
-                const array<IndexPair<typename internal::traits<Input>::Index>,
-                            1>,
-                const TensorReshapingOp<
-                    const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const TensorImagePatchOp<Dynamic, Dynamic, const Input> >,
-                const TensorReshapingOp<
-                    const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const Kernel>,
-                const OutputKernel> > >::type
-    SpatialConvolution(const Input& input, const Kernel& kernel,
-                       const Index row_stride = 1, const Index col_stride = 1,
-                       const PaddingType padding_type = PADDING_SAME,
-                       const Index row_in_stride = 1,
-                       const Index col_in_stride = 1,
-                       const OutputKernel& output_kernel = OutputKernel()) {
-  typedef typename internal::traits<Input>::Index TensorIndex;
-  TensorRef<Tensor<typename internal::traits<Input>::Scalar,
-                   internal::traits<Input>::NumDimensions,
-                   internal::traits<Input>::Layout, TensorIndex> >
-      in(input);
-  TensorRef<Tensor<typename internal::traits<Kernel>::Scalar,
-                   internal::traits<Kernel>::NumDimensions,
-                   internal::traits<Kernel>::Layout, TensorIndex> >
-      kern(kernel);
-
-  EIGEN_STATIC_ASSERT(
-      internal::traits<Input>::Layout == internal::traits<Kernel>::Layout,
-      YOU_MADE_A_PROGRAMMING_MISTAKE)
-  const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
-
-  const int NumDims = internal::traits<Input>::NumDimensions;
-
-  // Number of filters to apply. This is the same as the output depth of the
-  // result.
-  const TensorIndex kernelFilters =
-      isColMajor ? kern.dimensions()[0] : kern.dimensions()[3];
-  // Number of channels. This is the same as the input depth.
-  const TensorIndex kernelChannels =
-      isColMajor ? kern.dimensions()[1] : kern.dimensions()[2];
-  const TensorIndex kernelRows =
-      isColMajor ? kern.dimensions()[2] : kern.dimensions()[1];
-  const TensorIndex kernelCols =
-      isColMajor ? kern.dimensions()[3] : kern.dimensions()[0];
-
-  const Index kernelRowsEff =
-      kernelRows + (kernelRows - 1) * (row_in_stride - 1);
-  const Index kernelColsEff =
-      kernelCols + (kernelCols - 1) * (col_in_stride - 1);
-
-  array<IndexPair<TensorIndex>, 1> contract_dims;
-  contract_dims[0] = IndexPair<TensorIndex>(1, 0);
-
-  const TensorIndex InputRows =
-      isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
-  const TensorIndex InputCols =
-      isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
-
-  TensorIndex out_height;
-  TensorIndex out_width;
-  switch (padding_type) {
-    case PADDING_VALID:
-      out_height = numext::ceil((InputRows - kernelRowsEff + 1.f) /
-                                static_cast<float>(row_stride));
-      out_width = numext::ceil((InputCols - kernelColsEff + 1.f) /
-                               static_cast<float>(col_stride));
-      break;
-    case PADDING_SAME:
-      out_height = numext::ceil(InputRows / static_cast<float>(row_stride));
-      out_width = numext::ceil(InputCols / static_cast<float>(col_stride));
-      break;
-    default:
-      // Initialize unused variables to avoid a compiler warning
-      out_height = 0;
-      out_width = 0;
-      eigen_assert(false && "unexpected padding");
-  }
-
-  // Molds the output of the patch extraction code into a 2d tensor:
-  // - the first dimension (dims[0]): the patch values to be multiplied with the
-  // kernels
-  // - the second dimension (dims[1]): everything else
-  DSizes<TensorIndex, 2> pre_contract_dims;
-  if (isColMajor) {
-    pre_contract_dims[0] = kernelChannels * kernelRows * kernelCols;
-    pre_contract_dims[1] = out_height * out_width;
-    for (int i = 3; i < NumDims; ++i) {
-      pre_contract_dims[1] *= in.dimension(i);
-    }
-  } else {
-    pre_contract_dims[1] = kernelChannels * kernelRows * kernelCols;
-    pre_contract_dims[0] = out_height * out_width;
-    for (int i = 0; i < NumDims - 3; ++i) {
-      pre_contract_dims[0] *= in.dimension(i);
-    }
-  }
-
-  // Molds the output of the contraction into the shape expected by the user
-  // (assuming this is ColMajor):
-  // - 1st dim: kernel filters
-  // - 2nd dim: output height
-  // - 3rd dim: output width
-  // - 4th dim and beyond: everything else including batch size
-  DSizes<TensorIndex, NumDims> post_contract_dims;
-  if (isColMajor) {
-    post_contract_dims[0] = kernelFilters;
-    post_contract_dims[1] = out_height;
-    post_contract_dims[2] = out_width;
-    for (int i = 3; i < NumDims; ++i) {
-      post_contract_dims[i] = in.dimension(i);
-    }
-  } else {
-    post_contract_dims[NumDims - 1] = kernelFilters;
-    post_contract_dims[NumDims - 2] = out_height;
-    post_contract_dims[NumDims - 3] = out_width;
-    for (int i = 0; i < NumDims - 3; ++i) {
-      post_contract_dims[i] = in.dimension(i);
-    }
-  }
-
-  DSizes<TensorIndex, 2> kernel_dims;
-  if (isColMajor) {
-    kernel_dims[0] = kernelFilters;
-    kernel_dims[1] = kernelChannels * kernelRows * kernelCols;
-  } else {
-    kernel_dims[0] = kernelChannels * kernelRows * kernelCols;
-    kernel_dims[1] = kernelFilters;
-  }
-  return choose(
-      Cond<internal::traits<Input>::Layout == ColMajor>(),
-      kernel.reshape(kernel_dims)
-          .contract(input
-                        .extract_image_patches(
-                            kernelRows, kernelCols, row_stride, col_stride,
-                            row_in_stride, col_in_stride, padding_type)
-                        .reshape(pre_contract_dims),
-                    contract_dims, output_kernel)
-          .reshape(post_contract_dims),
-      input
-          .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
-                                 row_in_stride, col_in_stride, padding_type)
-          .reshape(pre_contract_dims)
-          .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
-          .reshape(post_contract_dims));
-}
-
-}  // end namespace Eigen
-
-// clang-format on
+#include "tensorflow/core/kernels/eigen_spatial_convolutions-inl.h"
 
 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
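
For reference, a minimal usage sketch of the SpatialConvolution helper whose implementation is moved above into eigen_spatial_convolutions-inl.h. It only illustrates the dimension ordering and the stride/PADDING_SAME defaults described in the removed doc comment; the include paths and tensor sizes are illustrative assumptions, not part of this change.

    #include "tensorflow/core/kernels/eigen_spatial_convolutions.h"
    #include "unsupported/Eigen/CXX11/Tensor"

    int main() {
      // Col-major input: (channels, height, width, batch).
      Eigen::Tensor<float, 4> input(3, 32, 32, 8);
      // Kernel: (filters, channels, kernel_height, kernel_width).
      Eigen::Tensor<float, 4> kernel(16, 3, 5, 5);
      input.setRandom();
      kernel.setRandom();

      // Defaults: row/col stride of 1, PADDING_SAME, no atrous (in-)strides.
      Eigen::Tensor<float, 4> output = Eigen::SpatialConvolution(input, kernel);
      // Output dimensions: (16, 32, 32, 8) = (filters, out_height, out_width, batch).
      return output.dimension(0) == 16 ? 0 : 1;
    }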
diff --git a/tensorflow/lite/kernels/kernel_util.cc b/tensorflow/lite/kernels/kernel_util.cc
index d6984e9..deb484b 100644
--- a/tensorflow/lite/kernels/kernel_util.cc
+++ b/tensorflow/lite/kernels/kernel_util.cc
@@ -23,6 +23,19 @@
 
 namespace tflite {
 
+void GuardedQuantizeMultiplier(double effective_output_scale,
+                               int32_t* significand, int* shift) {
+  QuantizeMultiplier(effective_output_scale, significand, shift);
+  // Additional guard to make sure RoundingDivideByPOT does not fail.
+  if (*shift < -31) {
+    // If shift is less than -31, RoundingDivideByPOT fails. This happens when
+    // min and max are close and small. For this particular case, both
+    // significand and shift are set to zero.
+    *significand = 0;
+    *shift = 0;
+  }
+}
+
 TfLiteStatus PopulateConvolutionQuantizationParams(
     TfLiteContext* context, const TfLiteTensor* input,
     const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output,
@@ -66,7 +79,7 @@
                                           static_cast<double>(output_scale);
     int32_t significand;
     int shift;
-    QuantizeMultiplier(effective_output_scale, &significand, &shift);
+    GuardedQuantizeMultiplier(effective_output_scale, &significand, &shift);
     per_channel_multiplier[i] = significand;
     per_channel_shift[i] = shift;
   }
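
A rough, self-contained sketch of why the new guard is needed. The helper below mirrors the frexp-style decomposition that QuantizeMultiplier performs (this is an illustration with a made-up name, not TFLite's exact implementation): a scale becomes a Q31 significand and a power-of-two shift, and once the shift drops below -31 RoundingDivideByPOT can no longer apply it, so both outputs are zeroed.

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    void SketchGuardedQuantizeMultiplier(double scale, int32_t* significand,
                                         int* shift) {
      if (scale == 0.0) { *significand = 0; *shift = 0; return; }
      const double q = std::frexp(scale, shift);  // scale = q * 2^shift, q in [0.5, 1)
      int64_t q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
      if (q_fixed == (1ll << 31)) { q_fixed /= 2; ++*shift; }
      *significand = static_cast<int32_t>(q_fixed);
      if (*shift < -31) { *significand = 0; *shift = 0; }  // the guard added above
    }

    int main() {
      int32_t m; int s;
      SketchGuardedQuantizeMultiplier(std::ldexp(1.0, -31), &m, &s);
      std::printf("2^-31 -> %d, %d\n", m, s);  // 1073741824, -30
      SketchGuardedQuantizeMultiplier(std::ldexp(1.0, -33), &m, &s);
      std::printf("2^-33 -> %d, %d\n", m, s);  // 0, 0: the guard kicks in
      return 0;
    }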
diff --git a/tensorflow/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h
index 57ff65f..423832c 100644
--- a/tensorflow/lite/kernels/kernel_util.h
+++ b/tensorflow/lite/kernels/kernel_util.h
@@ -92,6 +92,10 @@
     int32_t* output_activation_min, int32_t* output_activation_max,
     int32_t* per_channel_multiplier, int* per_channel_shift);
 
+// QuantizeMultiplier with a guard to ensure the shift will not be smaller
+// than -31.
+void GuardedQuantizeMultiplier(double effective_output_scale,
+                               int32_t* significand, int* shift);
+
 // Calculates the multiplication factor for a quantized convolution (or
 // quantized depthwise convolution) involving the given tensors. Returns an
 // error if the scales of the tensors are not compatible.
diff --git a/tensorflow/lite/kernels/kernel_util_test.cc b/tensorflow/lite/kernels/kernel_util_test.cc
index 4e79254..a31befb 100644
--- a/tensorflow/lite/kernels/kernel_util_test.cc
+++ b/tensorflow/lite/kernels/kernel_util_test.cc
@@ -251,6 +251,115 @@
   TfLiteTensorFree(&output);
 }
 
+TEST_F(KernelUtilTest, CheckAndPopulateZeroValue) {
+  // Create input.
+  TfLiteTensor input;
+  input.type = kTfLiteInt8;
+  input.allocation_type = kTfLiteArenaRw;
+  input.dims = TfLiteIntArrayCreate(1);
+  input.dims->data[0] = 2;
+  TfLiteQuantizationParams input_quant = {1, 5};
+  input.params = input_quant;
+  input.quantization.type = kTfLiteAffineQuantization;
+  auto* input_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  input_params->scale = TfLiteFloatArrayCreate(1);
+  input_params->scale->data[0] = 1;
+  input_params->zero_point = TfLiteIntArrayCreate(1);
+  input_params->zero_point->data[0] = 5;
+  input.quantization.params = reinterpret_cast<void*>(input_params);
+
+  // Create filter.
+  TfLiteTensor filter;
+  filter.type = kTfLiteInt8;
+  filter.allocation_type = kTfLiteArenaRw;
+  filter.dims = TfLiteIntArrayCreate(4);
+  filter.dims->data[0] = 3;
+  filter.dims->data[1] = 4;
+  filter.dims->data[2] = 5;
+  filter.dims->data[3] = 6;
+  TfLiteQuantizationParams filter_quant = {4.6566129e-10, 0};
+  filter.params = filter_quant;
+  filter.quantization.type = kTfLiteAffineQuantization;
+  auto* filter_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  filter_params->scale = TfLiteFloatArrayCreate(3);
+  int32_t two_pow_neg_31 = 0x30000000;  // Bit pattern of 2^-31, so shift = -30.
+  int32_t two_pow_neg_32 = 0x2F800000;  // Bit pattern of 2^-32, so shift = -31.
+  int32_t two_pow_neg_33 = 0x2F000000;  // Bit pattern of 2^-33, so shift = -32.
+  filter_params->scale->data[0] = *reinterpret_cast<float*>(&two_pow_neg_31);
+  filter_params->scale->data[1] = *reinterpret_cast<float*>(&two_pow_neg_32);
+  filter_params->scale->data[2] = *reinterpret_cast<float*>(&two_pow_neg_33);
+  filter_params->zero_point = TfLiteIntArrayCreate(3);
+  filter_params->zero_point->data[0] = 0;
+  filter_params->zero_point->data[1] = 0;
+  filter_params->zero_point->data[2] = 0;
+  filter_params->quantized_dimension = 0;
+  filter.quantization.params = reinterpret_cast<void*>(filter_params);
+
+  // Create bias.
+  TfLiteTensor bias;
+  bias.type = kTfLiteInt32;
+  bias.allocation_type = kTfLiteArenaRw;
+  bias.dims = TfLiteIntArrayCreate(4);
+  TfLiteQuantizationParams bias_quant = {4.6566129e-10, 9};
+  bias.params = bias_quant;
+  bias.quantization.type = kTfLiteAffineQuantization;
+  auto* bias_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  bias_params->scale = TfLiteFloatArrayCreate(3);
+  bias_params->scale->data[0] = 4.6566129e-10;  // 2^-31
+  bias_params->scale->data[1] = 2.3283064e-10;  // 2^-32
+  bias_params->scale->data[2] = 1.1641532e-10;  // 2^-33
+  bias_params->zero_point = TfLiteIntArrayCreate(3);
+  bias_params->zero_point->data[0] = 11;
+  bias_params->zero_point->data[1] = 12;
+  bias_params->zero_point->data[2] = 15;
+  bias.quantization.params = reinterpret_cast<void*>(bias_params);
+
+  // Create output.
+  TfLiteTensor output;
+  output.type = kTfLiteInt8;
+  output.allocation_type = kTfLiteArenaRw;
+  output.dims = nullptr;
+  TfLiteQuantizationParams output_quant = {1, -128};
+  output.params = output_quant;
+  output.quantization.type = kTfLiteAffineQuantization;
+  auto* output_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  output_params->scale = TfLiteFloatArrayCreate(1);
+  output_params->scale->data[0] = 1;
+  output_params->zero_point = TfLiteIntArrayCreate(1);
+  output_params->zero_point->data[0] = -128;
+  output.quantization.params = reinterpret_cast<void*>(output_params);
+
+  // Create call parameters.
+  TfLiteContext context;
+  int32_t multiplier;
+  int shift;
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+  std::vector<int32_t> per_channel_multiplier(3);
+  std::vector<int> per_channel_shift(3);
+
+  // Call and verify results for per channel case.
+  EXPECT_EQ(
+      kTfLiteOk,
+      PopulateConvolutionQuantizationParams(
+          &context, &input, &filter, &bias, &output, kTfLiteActRelu,
+          &multiplier, &shift, &output_activation_min, &output_activation_max,
+          per_channel_multiplier.data(), per_channel_shift.data()));
+  EXPECT_THAT(per_channel_multiplier,
+              ::testing::ElementsAre(1073741824, 1073741824, 0));
+  EXPECT_THAT(per_channel_shift, ::testing::ElementsAre(-30, -31, 0));
+
+  // Release.
+  TfLiteTensorFree(&input);
+  TfLiteTensorFree(&filter);
+  TfLiteTensorFree(&bias);
+  TfLiteTensorFree(&output);
+}
+
 }  // namespace
 }  // namespace tflite
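
The test above encodes the three per-channel filter scales as raw IEEE-754 bit patterns. A quick standalone check, not part of the test, of what those constants decode to and how that maps to the expected (multiplier, shift) triples:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      // The three filter-scale constants from the test above. memcpy is used
      // here as a strictly-defined alternative to the reinterpret_cast in the
      // test body.
      const uint32_t bit_patterns[] = {0x30000000u, 0x2F800000u, 0x2F000000u};
      for (uint32_t bits : bit_patterns) {
        float scale;
        std::memcpy(&scale, &bits, sizeof(scale));
        std::printf("0x%08X -> %g\n", bits, scale);
      }
      // Prints 4.65661e-10 (2^-31), 2.32831e-10 (2^-32), 1.16415e-10 (2^-33).
      // QuantizeMultiplier maps these to shifts of -30, -31 and -32; the last
      // one trips the new guard, hence the expected (multiplier, shift) = (0, 0).
      return 0;
    }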
 
diff --git a/tensorflow/lite/kernels/test_util.cc b/tensorflow/lite/kernels/test_util.cc
index 295204f..dd8d9ed 100644
--- a/tensorflow/lite/kernels/test_util.cc
+++ b/tensorflow/lite/kernels/test_util.cc
@@ -124,7 +124,7 @@
 
   CHECK(interpreter_ != nullptr);
 
-  for (int i = 0; i < input_shapes.size(); ++i) {
+  for (size_t i = 0; i < input_shapes.size(); ++i) {
     const int input_idx = interpreter_->inputs()[i];
     if (input_idx == kOptionalTensor) continue;
     const auto& shape = input_shapes[i];
diff --git a/tensorflow/lite/nnapi/NeuralNetworksTypes.h b/tensorflow/lite/nnapi/NeuralNetworksTypes.h
index ba7eaf6..109c6b0 100644
--- a/tensorflow/lite/nnapi/NeuralNetworksTypes.h
+++ b/tensorflow/lite/nnapi/NeuralNetworksTypes.h
@@ -133,6 +133,25 @@
 };
 
 /**
+ * Device types.
+ *
+ * The type of NNAPI device.
+ */
+enum {
+  /** The device type cannot be provided. */
+  ANEURALNETWORKS_DEVICE_UNKNOWN = 0,
+  /** The device does not fall into any category below. */
+  ANEURALNETWORKS_DEVICE_OTHER = 1,
+  /** The device runs NNAPI models on single or multi-core CPU. */
+  ANEURALNETWORKS_DEVICE_CPU = 2,
+  /** The device can run NNAPI models and also accelerate graphics APIs such
+   * as OpenGL ES and Vulkan. */
+  ANEURALNETWORKS_DEVICE_GPU = 3,
+  /** Dedicated accelerator for Machine Learning workloads. */
+  ANEURALNETWORKS_DEVICE_ACCELERATOR = 4,
+};
+
+/**
  * ANeuralNetworksMemory is an opaque type that represents memory.
  *
  * This type is used to represent shared memory, memory mapped files,
diff --git a/tensorflow/lite/nnapi/nnapi_implementation.cc b/tensorflow/lite/nnapi/nnapi_implementation.cc
index bbc0c86..6b3de3c 100644
--- a/tensorflow/lite/nnapi/nnapi_implementation.cc
+++ b/tensorflow/lite/nnapi/nnapi_implementation.cc
@@ -170,6 +170,7 @@
   LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksDevice_getVersion);
   LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
                          ANeuralNetworksDevice_getFeatureLevel);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksDevice_getType);
   LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
                          ANeuralNetworksModel_getSupportedOperationsForDevices);
   LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
diff --git a/tensorflow/lite/nnapi/nnapi_implementation.h b/tensorflow/lite/nnapi/nnapi_implementation.h
index 66a36db..b42c189 100644
--- a/tensorflow/lite/nnapi/nnapi_implementation.h
+++ b/tensorflow/lite/nnapi/nnapi_implementation.h
@@ -695,6 +695,25 @@
       const ANeuralNetworksDevice* device, int64_t* featureLevel);
 
   /**
+   * Get the type of a given device.
+   *
+   * The device type can be used to help application developers distribute
+   * Machine Learning workloads and other workloads such as graphical rendering.
+   * E.g., for an app that renders AR scenes based on real-time object
+   * detection results, the developer could choose an ACCELERATOR type device
+   * for ML workloads, and reserve the GPU for graphical rendering.
+   *
+   * @param device The representation of the specified device.
+   * @param type The returned {@link DeviceTypeCode} of the specified device.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksDevice_getType)(const ANeuralNetworksDevice* device,
+                                       int32_t* type);
+
+  /**
    * Get the supported operations for a specified set of devices. If multiple
    * devices are selected, the supported operation list is a union of supported
    * operations of all selected devices.
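
A hedged sketch of how a caller might combine the newly loaded ANeuralNetworksDevice_getType pointer with the existing device-enumeration entry points to prefer a dedicated accelerator. It assumes the NnApi struct returned by NnApiImplementation() resolved those optional symbols (Android, feature level 29+); this helper is not part of the change.

    #include <cstdint>
    #include "tensorflow/lite/nnapi/nnapi_implementation.h"

    // Returns true if any NNAPI device reports itself as a dedicated ML
    // accelerator, leaving the GPU free for rendering.
    bool HasNnapiAccelerator() {
      const NnApi* nnapi = NnApiImplementation();
      if (!nnapi->ANeuralNetworks_getDeviceCount ||
          !nnapi->ANeuralNetworks_getDevice ||
          !nnapi->ANeuralNetworksDevice_getType) {
        return false;  // Symbols unavailable on this device / API level.
      }
      uint32_t device_count = 0;
      if (nnapi->ANeuralNetworks_getDeviceCount(&device_count) !=
          ANEURALNETWORKS_NO_ERROR) {
        return false;
      }
      for (uint32_t i = 0; i < device_count; ++i) {
        ANeuralNetworksDevice* device = nullptr;
        if (nnapi->ANeuralNetworks_getDevice(i, &device) != ANEURALNETWORKS_NO_ERROR) {
          continue;
        }
        int32_t type = ANEURALNETWORKS_DEVICE_UNKNOWN;
        if (nnapi->ANeuralNetworksDevice_getType(device, &type) ==
                ANEURALNETWORKS_NO_ERROR &&
            type == ANEURALNETWORKS_DEVICE_ACCELERATOR) {
          return true;
        }
      }
      return false;
    }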
diff --git a/tensorflow/lite/testing/kernel_test/input_generator.cc b/tensorflow/lite/testing/kernel_test/input_generator.cc
index 897e185..5c69bdf 100644
--- a/tensorflow/lite/testing/kernel_test/input_generator.cc
+++ b/tensorflow/lite/testing/kernel_test/input_generator.cc
@@ -47,8 +47,8 @@
 std::vector<T> GenerateUniform(TfLiteIntArray* dims, float min, float max) {
   auto random_float = [](float min, float max) {
     // TODO(yunluli): Change seed for each invocation if needed.
-    static unsigned int seed;
-    return min + (max - min) * static_cast<float>(rand_r(&seed)) / RAND_MAX;
+    // Use rand() instead of rand_r() here to make it runnable on Android.
+    return min + (max - min) * static_cast<float>(rand()) / RAND_MAX;
   };
 
   std::function<T(int)> random_t = [&](int) {
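
A tiny standalone illustration of the uniform mapping used by the lambda above, with an explicit seed for reproducibility. The fixed seed is only for this sketch (the generator itself keeps the default rand() state); <random> would be a portable alternative but is not what this change uses.

    #include <cstdio>
    #include <cstdlib>

    int main() {
      std::srand(42);  // illustrative fixed seed
      const float min = -1.0f, max = 1.0f;
      for (int i = 0; i < 3; ++i) {
        // Same mapping as above: scale rand() into [min, max].
        const float v =
            min + (max - min) * static_cast<float>(std::rand()) / RAND_MAX;
        std::printf("%f\n", v);
      }
      return 0;
    }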
diff --git a/tensorflow/lite/toco/graph_transformations/quantize.cc b/tensorflow/lite/toco/graph_transformations/quantize.cc
index c7836f6..cef1774 100644
--- a/tensorflow/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/lite/toco/graph_transformations/quantize.cc
@@ -106,7 +106,7 @@
     // We always want [min, max] to contain 0.
     float min = 0.f;
     float max = 0.f;
-    for (auto val : data) {
+    for (const auto& val : data) {
       min = std::min(min, val);
       max = std::max(max, val);
     }
@@ -121,7 +121,7 @@
     // weights arrays for which fake-quantization would make sense, rather
     // they tend to be hardcoded arrays of zeros or ones used in some graphs.
     bool is_quantization_trivially_exact = true;
-    for (auto val : data) {
+    for (const auto& val : data) {
       is_quantization_trivially_exact &= (val == min || val == max);
     }
     if (!is_quantization_trivially_exact) {
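
A compact standalone illustration of the two checks touched in this hunk: the [min, max] range is seeded with 0 so it always contains it, and an array is "trivially exact" only if every value equals min or max. The data values are made up.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
      const std::vector<float> data = {0.f, 0.f, 1.f, 1.f};  // e.g. a hardcoded mask
      float min = 0.f, max = 0.f;  // seeded with 0 so the range always contains it
      for (const float val : data) {
        min = std::min(min, val);
        max = std::max(max, val);
      }
      bool trivially_exact = true;
      for (const float val : data) {
        trivially_exact &= (val == min || val == max);
      }
      // Here min = 0, max = 1 and every element is 0 or 1, so fake-quantization
      // of this array would be exact; an array such as {0, 0.3, 1} would not be.
      std::printf("min=%g max=%g trivially_exact=%d\n", min, max, trivially_exact);
      return 0;
    }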
diff --git a/tensorflow/lite/toco/graph_transformations/remove_trivial_binary.cc b/tensorflow/lite/toco/graph_transformations/remove_trivial_binary.cc
index 8879a7c..b9405e1 100644
--- a/tensorflow/lite/toco/graph_transformations/remove_trivial_binary.cc
+++ b/tensorflow/lite/toco/graph_transformations/remove_trivial_binary.cc
@@ -31,7 +31,7 @@
 template <typename Scalar>
 bool AreAllBufferElementsEqualTo(const std::vector<Scalar>& buffer_data,
                                  Scalar value) {
-  for (auto x : buffer_data) {
+  for (const auto& x : buffer_data) {
     if (x != value) {
       return false;
     }
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc
index 2d9035b..ce0854b 100644
--- a/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -54,7 +54,7 @@
   // Reduction mask will be elementwise multiplied against the input
   // indices to figure out the output index for the element.
   std::vector<int> reduction_mask(input_shape.dimensions_count(), 1);
-  for (int axis : axes) {
+  for (const auto& axis : axes) {
     CHECK_GE(axis, 0);
     CHECK_LT(axis, input_shape.dimensions_count());
     reduction_mask[axis] = 0;
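
The comment above describes how the reduction mask collapses input indices onto output indices. A small standalone example with a made-up shape and axis (not the resolver's actual indexing code) shows the effect:

    #include <cassert>
    #include <vector>

    int main() {
      // Made-up example: reduce a {2, 3, 4} input over axis 1.
      const std::vector<int> dims = {2, 3, 4};
      const std::vector<int> axes = {1};
      std::vector<int> reduction_mask(dims.size(), 1);
      for (const int axis : axes) reduction_mask[axis] = 0;  // -> {1, 0, 1}

      // Multiplying an input index elementwise by the mask zeroes the reduced
      // axis, so (i, j, k) and (i, j', k) land on the same output element.
      auto flat_output_index = [&](int i, int j, int k) {
        return (i * reduction_mask[0] * dims[1] + j * reduction_mask[1]) * dims[2] +
               k * reduction_mask[2];
      };
      assert(flat_output_index(1, 0, 3) == flat_output_index(1, 2, 3));
      assert(flat_output_index(1, 0, 3) != flat_output_index(0, 0, 3));
      return 0;
    }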
diff --git a/tensorflow/lite/toco/tooling_util.cc b/tensorflow/lite/toco/tooling_util.cc
index f53aae8..ca2477f 100644
--- a/tensorflow/lite/toco/tooling_util.cc
+++ b/tensorflow/lite/toco/tooling_util.cc
@@ -1097,7 +1097,7 @@
   std::unordered_map<string, string> reason_why_leftover;
   while (true) {
     bool inserted_something = false;
-    for (auto i : remaining) {
+    for (const auto& i : remaining) {
       bool can_insert = true;
       auto& op = old_operators[i];
       CHECK(op);
@@ -1167,7 +1167,7 @@
       }
       bad_inputs_already_traced.insert(bad_input);
       bad_op = nullptr;
-      for (auto i : remaining) {
+      for (const auto& i : remaining) {
         const Operator* op = old_operators[i].get();
         for (const string& output : op->outputs) {
           if (bad_input == output) {
@@ -1640,7 +1640,7 @@
       if (input_array_proto.has_shape()) {
         auto& input_array_dims = *input_array.mutable_shape()->mutable_dims();
         CheckValidShapeDimensions(input_array_proto.shape().dims());
-        for (auto dim : input_array_proto.shape().dims()) {
+        for (const auto& dim : input_array_proto.shape().dims()) {
           input_array_dims.push_back(dim);
         }
       }
@@ -2333,7 +2333,7 @@
         // Make sure to create the shape even if there are no dims, to
         // correctly record 0-D shapes.
         array.mutable_shape();
-        for (int dim : entry.shape().dims()) {
+        for (const auto& dim : entry.shape().dims()) {
           array.mutable_shape()->mutable_dims()->push_back(dim);
         }
       }
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 3ae7b8f..e4e86e0 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1011,7 +1011,7 @@
     ],
 )
 
-tf_py_test(
+cuda_py_test(
     name = "config_test",
     size = "small",
     srcs = ["framework/config_test.py"],
@@ -1185,6 +1185,9 @@
     name = "framework_test_lib",
     srcs = ["framework/test_util.py"],
     srcs_version = "PY2AND3",
+    visibility = visibility + [
+        "//tensorflow_estimator/python/estimator:__subpackages__",
+    ],
     deps = [
         ":array_ops",
         ":client",
@@ -3711,6 +3714,7 @@
     ],
     shard_count = 4,
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/autograph/__init__.py b/tensorflow/python/autograph/__init__.py
index 3009cff..5fb9bcb 100644
--- a/tensorflow/python/autograph/__init__.py
+++ b/tensorflow/python/autograph/__init__.py
@@ -36,7 +36,6 @@
 from tensorflow.python.autograph import utils
 from tensorflow.python.autograph.core.converter import ConversionOptions
 from tensorflow.python.autograph.core.converter import Feature
-from tensorflow.python.autograph.core.converter import Verbosity
 from tensorflow.python.autograph.core.errors import GraphConstructionError
 from tensorflow.python.autograph.core.errors import improved_errors
 from tensorflow.python.autograph.core.errors import TfRuntimeError
@@ -71,7 +70,6 @@
     'improved_errors',
     'GraphConstructionError',
     'TfRuntimeError',
-    'Verbosity',
     # Python language "extensions"
     'set_element_type',
     'set_loop_options',
diff --git a/tensorflow/python/autograph/converters/call_trees.py b/tensorflow/python/autograph/converters/call_trees.py
index a35ff16..8366e19 100644
--- a/tensorflow/python/autograph/converters/call_trees.py
+++ b/tensorflow/python/autograph/converters/call_trees.py
@@ -129,7 +129,6 @@
         func=func,
         owner=owner,
         options=self.ctx.program.options.to_ast(
-            self.ctx,
             internal_convert_user_code=self.ctx.program.options.recursive),
         args=args,
         kwargs=kwargs)
diff --git a/tensorflow/python/autograph/converters/conditional_expressions.py b/tensorflow/python/autograph/converters/conditional_expressions.py
index a4eef7e..4538b16 100644
--- a/tensorflow/python/autograph/converters/conditional_expressions.py
+++ b/tensorflow/python/autograph/converters/conditional_expressions.py
@@ -27,7 +27,8 @@
 
   def visit_IfExp(self, node):
     return templates.replace_as_expression(
-        'ag__.if_stmt(test, lambda: true_expr, lambda: false_expr)',
+        '''ag__.if_stmt(test, lambda: true_expr,
+                        lambda: false_expr, lambda: (), lambda _: None)''',
         test=node.test,
         true_expr=node.body,
         false_expr=node.orelse)
diff --git a/tensorflow/python/autograph/converters/control_flow.py b/tensorflow/python/autograph/converters/control_flow.py
index 9412f3d..c8dde80 100644
--- a/tensorflow/python/autograph/converters/control_flow.py
+++ b/tensorflow/python/autograph/converters/control_flow.py
@@ -88,23 +88,33 @@
       return templates.replace(
           template, body_name=body_name, body=body, return_stmt=return_stmt)
 
-  def _create_cond_expr(self, results, test, body_name, orelse_name):
+  def _create_cond_expr(self, results, test, body_name, orelse_name,
+                        state_getter_name,
+                        state_setter_name):
     if results is not None:
       template = """
-        results = ag__.if_stmt(test, body_name, orelse_name)
+        results = ag__.if_stmt(test, body_name, orelse_name,
+                               state_getter_name, state_setter_name)
       """
       return templates.replace(
           template,
           test=test,
           results=results,
           body_name=body_name,
-          orelse_name=orelse_name)
+          orelse_name=orelse_name,
+          state_getter_name=state_getter_name,
+          state_setter_name=state_setter_name)
     else:
       template = """
-        ag__.if_stmt(test, body_name, orelse_name)
+        ag__.if_stmt(test, body_name, orelse_name, getter_name, setter_name)
       """
       return templates.replace(
-          template, test=test, body_name=body_name, orelse_name=orelse_name)
+          template,
+          test=test,
+          body_name=body_name,
+          orelse_name=orelse_name,
+          getter_name=state_getter_name,
+          setter_name=state_setter_name)
 
   def _fmt_symbols(self, symbol_set):
     if not symbol_set:
@@ -138,6 +148,47 @@
           block_live_in.add(s)
     return scope.modified & node_defined_in & block_live_in
 
+  def _create_state_functions(self, composites,
+                              state_getter_name, state_setter_name):
+    if composites:
+      composite_tuple = tuple(composites)
+      template = """
+        def state_getter_name():
+          return composite_tuple,
+        def state_setter_name(vals):
+          composite_tuple, = vals
+      """
+      node = templates.replace(
+          template,
+          state_getter_name=state_getter_name,
+          state_setter_name=state_setter_name,
+          composite_tuple=composite_tuple)
+    else:
+      template = """
+        def state_getter_name():
+          return ()
+        def state_setter_name(_):
+          pass
+        """
+      node = templates.replace(
+          template,
+          state_getter_name=state_getter_name,
+          state_setter_name=state_setter_name)
+
+    return node
+
+  def _create_undefined_assigns(self, undefined_symbols):
+    assignments = []
+    for s in undefined_symbols:
+      template = '''
+        var = ag__.Undefined(symbol_name)
+      '''
+      assignments += templates.replace(
+          template,
+          var=s,
+          symbol_name=gast.Str(s.ssf()))
+    return assignments
+
   def visit_If(self, node):
     body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE)
     orelse_scope = anno.getanno(node, annos.NodeAnno.ORELSE_SCOPE)
@@ -156,14 +207,17 @@
 
     modified_in_cond = body_scope.modified | orelse_scope.modified
     returned_from_cond = set()
+    composites = set()
     for s in modified_in_cond:
       if s in live_out:
         returned_from_cond.add(s)
-      elif s.is_composite():
-        # Special treatment for compound objects: if any of their owner entities
-        # are live, then they are outputs as well.
-        if live_out & s.owner_set:
-          returned_from_cond.add(s)
+      if s.is_composite():
+        # Special treatment for compound objects: always return them.
+        # This allows special handling within the if_stmt itself.
+        # For example, in TensorFlow we need to restore the state of composite
+        # symbols to ensure that only effects from the executed branch are seen.
+        returned_from_cond.add(s)
+        composites.add(s)
 
     created_in_body = body_scope.modified & returned_from_cond - defined_in
     created_in_orelse = orelse_scope.modified & returned_from_cond - defined_in
@@ -202,6 +256,9 @@
     cond_var_name = self.ctx.namer.new_symbol('cond', body_scope.referenced)
     body_name = self.ctx.namer.new_symbol('if_true', body_scope.referenced)
     orelse_name = self.ctx.namer.new_symbol('if_false', orelse_scope.referenced)
+    all_referenced = body_scope.referenced | orelse_scope.referenced
+    state_getter_name = self.ctx.namer.new_symbol('get_state', all_referenced)
+    state_setter_name = self.ctx.namer.new_symbol('set_state', all_referenced)
 
     returned_from_cond = tuple(returned_from_cond)
     if returned_from_cond:
@@ -245,27 +302,15 @@
         body=node_orelse,
         returns=returned_from_orelse)
     undefined_assigns = self._create_undefined_assigns(possibly_undefined)
+    composite_defs = self._create_state_functions(
+        composites, state_getter_name, state_setter_name)
 
     cond_expr = self._create_cond_expr(cond_results, cond_var_name, body_name,
-                                       orelse_name)
+                                       orelse_name, state_getter_name,
+                                       state_setter_name)
 
-    return (undefined_assigns
-            + cond_assign
-            + body_def
-            + orelse_def
-            + cond_expr)
-
-  def _create_undefined_assigns(self, undefined_symbols):
-    assignments = []
-    for s in undefined_symbols:
-      template = '''
-        var = ag__.Undefined(symbol_name)
-      '''
-      assignments += templates.replace(
-          template,
-          var=s,
-          symbol_name=gast.Str(s.ssf()))
-    return assignments
+    return (undefined_assigns + cond_assign + composite_defs + body_def +
+            orelse_def + cond_expr)
 
   def _get_loop_state(self, node):
     body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE)
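
The change above threads a state getter/setter pair into if_stmt so that composite symbols can be snapshotted and restored, and only the taken branch's effects remain visible. Below is a conceptual C++ sketch of that contract under eager-style evaluation; the names and the evaluation of both branches are illustrative assumptions, not AutoGraph's actual Python implementation (which defers to tf.cond or a plain Python if).

    #include <cstdio>
    #include <functional>
    #include <tuple>

    struct Foo { int b = 2; int c = 3; };

    void if_stmt(bool cond,
                 const std::function<void()>& body,
                 const std::function<void()>& orelse,
                 const std::function<std::tuple<int, int>()>& get_state,
                 const std::function<void(std::tuple<int, int>)>& set_state) {
      const auto snapshot = get_state();   // composite state before either branch
      body();
      const auto body_state = get_state();
      set_state(snapshot);                 // undo the body's side effects
      orelse();
      const auto orelse_state = get_state();
      set_state(cond ? body_state : orelse_state);  // keep only the taken branch
    }

    int main() {
      Foo x;
      auto get_state = [&] { return std::make_tuple(x.b, x.c); };
      auto set_state = [&](std::tuple<int, int> vals) { std::tie(x.b, x.c) = vals; };
      if_stmt(/*cond=*/false,
              /*body=*/[&] { x.b = 7; x.c = 11; },
              /*orelse=*/[&] {},
              get_state, set_state);
      std::printf("%d %d\n", x.b, x.c);  // 2 3: the untaken branch left no trace
      return 0;
    }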
diff --git a/tensorflow/python/autograph/converters/control_flow_test.py b/tensorflow/python/autograph/converters/control_flow_test.py
index d332c50..9ad229c 100644
--- a/tensorflow/python/autograph/converters/control_flow_test.py
+++ b/tensorflow/python/autograph/converters/control_flow_test.py
@@ -251,6 +251,52 @@
     self.assertTransformedResult(test_fn, constant_op.constant(-1), -1)
 
   @test_util.run_deprecated_v1
+  def test_if_unbalanced_multiple_composites(self):
+
+    class Foo(object):
+
+      def __init__(self):
+        self.b = 2
+        self.c = 3
+
+    def test_fn(x, condition):
+
+      z = 5
+      if condition:
+        x.b = 7
+        x.c = 11
+        z = 13
+
+      return x.b, x.c, z
+
+    self.assertTransformedResult(test_fn, (Foo(), constant_op.constant(True)),
+                                 (7, 11, 13))
+    self.assertTransformedResult(test_fn, (Foo(), constant_op.constant(False)),
+                                 (2, 3, 5))
+
+  @test_util.run_deprecated_v1
+  def test_if_unbalanced_composite(self):
+
+    class Foo(object):
+
+      def __init__(self):
+        self.b = 2
+
+    def test_fn(x, condition):
+
+      z = 5
+      if condition:
+        x.b = 7
+        z = 13
+
+      return x.b, z
+
+    self.assertTransformedResult(test_fn, (Foo(), constant_op.constant(True)),
+                                 (7, 13))
+    self.assertTransformedResult(test_fn, (Foo(), constant_op.constant(False)),
+                                 (2, 5))
+
+  @test_util.run_deprecated_v1
   def test_simple_for(self):
 
     def test_fn(l):
diff --git a/tensorflow/python/autograph/converters/function_scopes_test.py b/tensorflow/python/autograph/converters/function_scopes_test.py
index 5a1248c..0eccf39 100644
--- a/tensorflow/python/autograph/converters/function_scopes_test.py
+++ b/tensorflow/python/autograph/converters/function_scopes_test.py
@@ -92,7 +92,7 @@
         return l, inner_fn(l)
 
     ns = {'TestClass': TestClass}
-    node, ctx = self.prepare(TestClass, ns, owner_type=TestClass)
+    node, ctx = self.prepare(TestClass, ns)
     node = function_scopes.transform(node, ctx)
 
     with self.compiled(node, {}, ops.name_scope) as result:
diff --git a/tensorflow/python/autograph/core/converter.py b/tensorflow/python/autograph/core/converter.py
index 3a08483..2e106ed 100644
--- a/tensorflow/python/autograph/core/converter.py
+++ b/tensorflow/python/autograph/core/converter.py
@@ -63,17 +63,13 @@
 from __future__ import division
 from __future__ import print_function
 
-import weakref
-
 import enum
 
 from tensorflow.python.autograph.core import config
-from tensorflow.python.autograph.core import naming
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import ast_util
 from tensorflow.python.autograph.pyct import cfg
 from tensorflow.python.autograph.pyct import compiler
-from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import qual_names
 from tensorflow.python.autograph.pyct import templates
@@ -83,7 +79,6 @@
 from tensorflow.python.autograph.pyct.static_analysis import liveness
 from tensorflow.python.autograph.pyct.static_analysis import reaching_definitions
 from tensorflow.python.autograph.pyct.static_analysis import type_info
-from tensorflow.python.eager import function
 from tensorflow.python.util.tf_export import tf_export
 
 # TODO(mdan): These contexts can be refactored into first class objects.
@@ -93,6 +88,7 @@
 # TODO(mdan): Add a test specific to this converter.
 
 
+# TODO(mdan): Remove when updating the API.
 @tf_export('autograph.experimental.Verbosity')
 class Verbosity(enum.IntEnum):
   """Represents conversion verbosity levels.
@@ -154,11 +150,6 @@
   Attributes:
     recursive: bool, whether to recursively convert any user functions or
       classes that the converted function may use.
-    verbose: Verbosity, the level of verbosity to use.
-    strip_decorators: Tuple[Callable], contains decorators that should be
-      excluded from the compiled output. By default, when converting a function
-      before the decorators are applied, the compiled output will include those
-      decorators.
     force_conversion: bool, whether to force converting the target entity. When
       force_conversion is turned off, the converter may decide to return the
       function as-is.
@@ -169,14 +160,10 @@
 
   def __init__(self,
                recursive=False,
-               verbose=Verbosity.VERBOSE,
-               strip_decorators=None,
                force_conversion=False,
                internal_convert_user_code=True,
                optional_features=Feature.ALL):
     self.recursive = recursive
-    self.verbose = verbose
-    self._strip_decorators = strip_decorators or ()
     self.force_conversion = force_conversion
     # TODO(mdan): Rename to conversion_recursion_depth?
     self.internal_convert_user_code = internal_convert_user_code
@@ -188,34 +175,17 @@
     optional_features = frozenset(optional_features)
     self.optional_features = optional_features
 
-  @property
-  def strip_decorators(self):
-    # A few decorators are included by default.
-    # TODO(mdan): Revert if function.defun becomes a public symbol.
-    return self._strip_decorators + (function.defun,)
-
-  def should_strip(self, decorator):
-    for blacklisted in self.strip_decorators:
-      if blacklisted is decorator:
-        return True
-      if isinstance(blacklisted, weakref.ref):
-        blacklisted_deref = blacklisted()
-        if (blacklisted_deref is not None and blacklisted_deref is decorator):
-          return True
-    return False
-
   def uses(self, feature):
     return (Feature.ALL in self.optional_features or
             feature in self.optional_features)
 
-  def to_ast(self, ctx, internal_convert_user_code=None):
+  def to_ast(self, internal_convert_user_code=None):
     """Returns a representation of this object as an AST node.
 
     The AST node encodes a constructor that would create an object with the
     same contents.
 
     Args:
-      ctx: EntityContext, the entity with which this AST needs to be consistent.
       internal_convert_user_code: Optional[bool], allows overriding the
         corresponding value.
 
@@ -225,30 +195,11 @@
     template = """
       ag__.ConversionOptions(
           recursive=recursive_val,
-          verbose=verbose_val,
-          strip_decorators=strip_decorators_val,
           force_conversion=force_conversion_val,
           optional_features=optional_features_val,
           internal_convert_user_code=internal_convert_user_code_val)
     """
 
-    def as_qualified_name(o):
-      name = inspect_utils.getqualifiedname(ctx.info.namespace, o, max_depth=1)
-      if not name:
-        if isinstance(o, weakref.ref):
-          # `o` might already be a weak reference, if this object was
-          # constructed from code generated by `to_ast` itself.
-          # If so, unpack it.
-          o = o()
-        # TODO(mdan): This needs to account for the symbols defined locally.
-        name = ctx.namer.new_symbol(o.__name__, ())
-        ctx.program.add_symbol(name, weakref.ref(o))
-      return name
-
-    def list_of_names(values):
-      return parser.parse_expression('({})'.format(', '.join(
-          tuple(as_qualified_name(v) for v in values))))
-
     def list_of_features(values):
       return parser.parse_expression('({})'.format(', '.join(
           'ag__.{}'.format(str(v)) for v in values)))
@@ -259,8 +210,6 @@
     expr_ast = templates.replace(
         template,
         recursive_val=parser.parse_expression(str(self.recursive)),
-        verbose_val=parser.parse_expression(str(int(self.verbose))),
-        strip_decorators_val=list_of_names(self._strip_decorators),
         force_conversion_val=parser.parse_expression(
             str(self.force_conversion)),
         internal_convert_user_code_val=parser.parse_expression(
@@ -276,95 +225,26 @@
 
   Attributes:
     options: ConversionOptions
-    dependency_cache: Dict[Any, ast.AST], the original entities mapped to their
-      converted AST
-    additional_imports: Set[Any], additional entities which for any reason
-      cannot be attached after loading and need to be explicitly imported in the
-      generated code
-    name_map: Dict[str, str], map of original entity name to the name of their
-      converted counterparts
     autograph_module: Module, a reference to the autograph module. This needs to
       be specified by the caller to avoid circular dependencies.
-    uncompiled_modules: Set[Tuple[str, ...]], with each tuple representing the
-      fully qualified name of a package containing functions that will not be
-      compiled.
     required_imports: str, containing an import statement on each line. These
       are all the imports necessary for the compiled code to run, in addition to
       the closures of each entity, which are attached dynamically.
-    partial_types: Tuple[Type], deprecated.
-    conversion_order: Tuple[Any], deprecated.
-    additional_symbols: Dict[str, Any], a map of new symbols that have been
-      created under this context, and need to be added to the namespace of the
-      generated code.
   """
 
   def __init__(
       self,
       options,
-      partial_types,
       autograph_module,
-      uncompiled_modules,
   ):
     self.options = options
-    self.partial_types = partial_types if partial_types else ()
     self.autograph_module = autograph_module
-    self.uncompiled_modules = uncompiled_modules
-
-    self.conversion_order = []
-    self.dependency_cache = {}
-    self.additional_imports = set()
-    self.name_map = {}
-    self.additional_symbols = {}
 
   @property
   def required_imports(self):
     """Returns a block containing all imports required by the converted code."""
     # TODO(mdan): Check that these don't clobber one another.
-    return '\n'.join(config.COMPILED_IMPORT_STATEMENTS +
-                     tuple(self.additional_imports))
-
-  def new_namer(self, namespace):
-    return naming.Namer(namespace, self.options.recursive, self.name_map,
-                        self.partial_types)
-
-  def update_name_map(self, namer):
-    """Updates renamed_calls based on the recent activity from the namer.
-
-    Whenever we convert a new entity, any references to other entities are being
-    renamed to match their soon-to-be-converted counterparts. The namer keeps
-    track of these renames. When conversion is complete, we copy those renames
-    so that when those referenced entities are being converted, their new name
-    matches.
-
-    Args:
-      namer: naming.Namer
-
-    Raises:
-      ValueError: when an entity was renamed twice and to different names.
-    """
-    # TODO(mdan): Have call_trees do this directly.
-    # This is done so indirectly, via the namer, for historic reasons. But
-    # now we can have the converter that does the rename record the new name
-    # as well and skip this step altogether.
-    for o, name in namer.renamed_calls.items():
-      if o in self.name_map:
-        if self.name_map[o] != name:
-          raise ValueError(
-              'Calls to %s were converted using multiple names (%s). This is '
-              'possible when an entity with one of these names already '
-              'existed. To fix, avoid using any of these names.' %
-              (o, (name, self.name_map[o])))
-      else:
-        self.name_map[o] = name
-
-  def add_symbol(self, name, value):
-    if name in self.additional_symbols:
-      assert self.additional_symbols[name] is value
-    self.additional_symbols[name] = value
-
-  def add_to_cache(self, original_entity, converted_ast):
-    self.conversion_order.append(original_entity)
-    self.dependency_cache[original_entity] = converted_ast
+    return '\n'.join(config.COMPILED_IMPORT_STATEMENTS)
 
 
 class EntityContext(transformer.Context):
diff --git a/tensorflow/python/autograph/core/converter_test.py b/tensorflow/python/autograph/core/converter_test.py
index 4050878..938569b 100644
--- a/tensorflow/python/autograph/core/converter_test.py
+++ b/tensorflow/python/autograph/core/converter_test.py
@@ -12,21 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for lists module."""
+"""Tests for converter module."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import weakref
-
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import templates
-from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.platform import test
 
 
@@ -38,22 +35,7 @@
 
   def test_to_ast(self):
     opts = converter.ConversionOptions()
-
-    namer = converter_testing.FakeNamer()
-    program_ctx = converter.ProgramContext(
-        options=opts,
-        partial_types=None,
-        autograph_module=None,
-        uncompiled_modules=())
-    entity_info = transformer.EntityInfo(
-        source_code='',
-        source_file='<fragment>',
-        namespace={},
-        arg_values=None,
-        arg_types={},
-        owner_type=None)
-    ctx = converter.EntityContext(namer, entity_info, program_ctx)
-    opts_ast = opts.to_ast(ctx)
+    opts_ast = opts.to_ast()
 
     template = '''
     def test_fn():
@@ -68,40 +50,12 @@
     reparsed_opts = reparsed.test_fn()
 
     self.assertEqual(opts.recursive, reparsed_opts.recursive)
-    self.assertEqual(opts.verbose, reparsed_opts.verbose)
     self.assertEqual(opts.force_conversion, reparsed_opts.force_conversion)
     self.assertEqual(
         opts.internal_convert_user_code,
         reparsed_opts.internal_convert_user_code)
     self.assertEqual(opts.optional_features, reparsed_opts.optional_features)
 
-  def test_should_strip_weakrefs(self):
-    def test_fn():
-      pass
-
-    def weak_test_fn_a():
-      pass
-
-    def weak_test_fn_b():
-      pass
-
-    def weak_test_fn_c():
-      pass
-
-    wr_a = weakref.ref(weak_test_fn_a)
-    # Create an extra weakref to check whether the existence of multiple weak
-    # references influences the process.
-    _ = weakref.ref(weak_test_fn_b)
-    wr_b = weakref.ref(weak_test_fn_b)
-    _ = weakref.ref(weak_test_fn_c)
-
-    opts = converter.ConversionOptions(strip_decorators=(test_fn, wr_a, wr_b))
-
-    self.assertTrue(opts.should_strip(test_fn))
-    self.assertTrue(opts.should_strip(weak_test_fn_a))
-    self.assertTrue(opts.should_strip(weak_test_fn_b))
-    self.assertFalse(opts.should_strip(weak_test_fn_c))
-
 
 class ConverterBaseTest(converter_testing.TestCase):
 
diff --git a/tensorflow/python/autograph/core/converter_testing.py b/tensorflow/python/autograph/core/converter_testing.py
index 56445db..81b4b9f 100644
--- a/tensorflow/python/autograph/core/converter_testing.py
+++ b/tensorflow/python/autograph/core/converter_testing.py
@@ -26,13 +26,12 @@
 
 from tensorflow.python.autograph import operators
 from tensorflow.python.autograph import utils
-from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.core import errors
 from tensorflow.python.autograph.core import function_wrapping
+from tensorflow.python.autograph.core import naming
 from tensorflow.python.autograph.lang import special_functions
 from tensorflow.python.autograph.pyct import compiler
-from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import origin_info
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import pretty_printer
@@ -42,38 +41,6 @@
 RESULT_OF_MOCK_CONVERTED_CALL = 7
 
 
-# TODO(mdan): We should use the real namer here.
-class FakeNamer(object):
-  """A fake namer that uses a global counter to generate unique names."""
-
-  def __init__(self):
-    self.i = 0
-    self.partial_types = ()
-
-  def new_symbol(self, name_root, used):
-    while True:
-      self.i += 1
-      name = '%s%d' % (name_root, self.i)
-      if name not in used:
-        return name
-
-  def compiled_function_name(self,
-                             original_fqn,
-                             live_entity=None,
-                             owner_type=None):
-    if inspect_utils.islambda(live_entity):
-      return None, False
-    if owner_type is not None:
-      return None, False
-    return ('renamed_%s' % '_'.join(original_fqn)), True
-
-
-class FakeNoRenameNamer(FakeNamer):
-
-  def compiled_function_name(self, original_fqn, **_):
-    return str(original_fqn), False
-
-
 class TestCase(test.TestCase):
   """Base class for unit tests in this module. Contains relevant utilities."""
 
@@ -152,35 +119,21 @@
     for k, v in ns.items():
       setattr(module, k, v)
 
-  def prepare(self,
-              test_fn,
-              namespace,
-              namer=None,
-              arg_types=None,
-              owner_type=None,
-              recursive=True,
-              strip_decorators=()):
+  def prepare(self, test_fn, namespace, arg_types=None, recursive=True):
     namespace['ConversionOptions'] = converter.ConversionOptions
 
     node, source = parser.parse_entity(test_fn)
     node = node.body[0]
-    if namer is None:
-      namer = FakeNamer()
+    namer = naming.Namer(namespace)
     program_ctx = converter.ProgramContext(
-        options=converter.ConversionOptions(
-            recursive=recursive,
-            strip_decorators=strip_decorators,
-            verbose=True),
-        partial_types=None,
-        autograph_module=None,
-        uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
+        options=converter.ConversionOptions(recursive=recursive),
+        autograph_module=None)
     entity_info = transformer.EntityInfo(
         source_code=source,
         source_file='<fragment>',
         namespace=namespace,
         arg_values=None,
-        arg_types=arg_types,
-        owner_type=owner_type)
+        arg_types=arg_types)
     ctx = converter.EntityContext(namer, entity_info, program_ctx)
     origin_info.resolve(node, source, test_fn)
     node = converter.standard_analysis(node, ctx, is_initial=True)
diff --git a/tensorflow/python/autograph/core/naming.py b/tensorflow/python/autograph/core/naming.py
index 245795c..aa23779 100644
--- a/tensorflow/python/autograph/core/naming.py
+++ b/tensorflow/python/autograph/core/naming.py
@@ -20,7 +20,6 @@
 
 import enum
 
-from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import qual_names
 from tensorflow.python.autograph.utils import misc
 
@@ -31,27 +30,10 @@
 
 
 class Namer(object):
-  """Implementation of the namer interfaces required by various converters.
+  """Symbol name generartor."""
 
-  This implementation performs additional tasks like keeping track of the
-  function calls that have been encountered and replaced with calls to their
-  corresponding compiled counterparts.
-
-  Interfaces currently implemented:
-    * call_trees.FunctionNamer
-    * control_flow.SymbolNamer
-    * side_effect_guards.SymbolNamer
-  """
-
-  def __init__(self, global_namespace, recursive, name_map, partial_types):
+  def __init__(self, global_namespace):
     self.global_namespace = global_namespace
-    self.recursive = recursive
-    self.partial_types = partial_types
-
-    self.renamed_calls = {}
-    if name_map is not None:
-      self.renamed_calls.update(name_map)
-
     self.generated_names = set()
 
   def _as_symbol_name(self, fqn, style=_NamingStyle.SNAKE):
@@ -92,11 +74,8 @@
     elif style == _NamingStyle.SNAKE:
       return '_'.join(pieces)
 
-  def compiled_class_name(self, original_fqn, live_entity=None):
-    """See call_trees.FunctionNamer.compiled_class_name."""
-    if live_entity is not None and live_entity in self.renamed_calls:
-      return self.renamed_calls[live_entity]
-
+  def class_name(self, original_fqn):
+    """Returns the name of a converted class."""
     canonical_name = self._as_symbol_name(
         original_fqn, style=_NamingStyle.CAMEL)
     new_name_root = 'Tf%s' % canonical_name
@@ -105,30 +84,11 @@
     while new_name in self.global_namespace:
       n += 1
       new_name = '%s_%d' % (new_name_root, n)
-
     self.generated_names.add(new_name)
-    if live_entity is not None:
-      self.renamed_calls[live_entity] = new_name
     return new_name
 
-  def compiled_function_name(self,
-                             original_fqn,
-                             live_entity=None,
-                             owner_type=None):
-    """See call_trees.FunctionNamer.compiled_function_name."""
-    if not self.recursive:
-      return None, False
-
-    if (live_entity is not None and inspect_utils.islambda(live_entity)):
-      return None, False
-
-    if owner_type is not None and owner_type not in self.partial_types:
-      # Members are not renamed when part of an entire converted class.
-      return None, False
-
-    if live_entity is not None and live_entity in self.renamed_calls:
-      return self.renamed_calls[live_entity], True
-
+  def function_name(self, original_fqn):
+    """Returns the name of a converted function."""
     canonical_name = self._as_symbol_name(
         original_fqn, style=_NamingStyle.SNAKE)
     new_name_root = 'tf__%s' % canonical_name
@@ -137,12 +97,8 @@
     while new_name in self.global_namespace:
       n += 1
       new_name = '%s_%d' % (new_name_root, n)
-
-    if live_entity is not None:
-      self.renamed_calls[live_entity] = new_name
     self.generated_names.add(new_name)
-
-    return new_name, True
+    return new_name
 
   def new_symbol(self, name_root, reserved_locals):
     """See control_flow.SymbolNamer.new_symbol."""
diff --git a/tensorflow/python/autograph/core/naming_test.py b/tensorflow/python/autograph/core/naming_test.py
index cc8c431..49526ed 100644
--- a/tensorflow/python/autograph/core/naming_test.py
+++ b/tensorflow/python/autograph/core/naming_test.py
@@ -24,64 +24,47 @@
 
 class NamerTest(test.TestCase):
 
-  def test_compiled_function_name_tracks_names(self):
-    def bar():
-      pass
-
-    namer = naming.Namer({}, True, None, ())
-    self.assertEqual(('tf__foo', True), namer.compiled_function_name('foo'))
-    self.assertEqual(('tf__bar', True), namer.compiled_function_name(
-        'bar', bar))
-    self.assertEqual({bar: 'tf__bar'}, namer.renamed_calls)
+  def test_function_name_tracks_names(self):
+    namer = naming.Namer({})
+    self.assertEqual('tf__foo', namer.function_name('foo'))
+    self.assertEqual('tf__bar', namer.function_name('bar'))
     self.assertItemsEqual(('tf__bar', 'tf__foo'), namer.generated_names)
 
-  def test_compiled_function_name_consistent(self):
-    def foo():
-      pass
+  def test_function_name_consistent(self):
+    namer = naming.Namer({})
+    self.assertEqual('tf__foo', namer.function_name('foo'))
+    self.assertEqual('tf__foo', namer.function_name('foo'))
 
-    namer = naming.Namer({}, True, None, ())
-    self.assertEqual(('tf__foo', True), namer.compiled_function_name(
-        'foo', foo))
-    self.assertEqual(('tf__foo', True), namer.compiled_function_name(
-        'foo', foo))
+  def test_function_name_unsanitized_fqn(self):
+    namer = naming.Namer({})
+    self.assertEqual('tf__foo_bar', namer.function_name('foo.bar'))
+    self.assertEqual('tf__foo_bar_baz', namer.function_name(('foo.bar', 'baz')))
 
-  def test_compiled_function_name_unsanitized_fqn(self):
-    namer = naming.Namer({}, True, None, ())
-    self.assertEqual(('tf__foo_bar', True),
-                     namer.compiled_function_name('foo.bar'))
-    self.assertEqual(('tf__foo_bar_baz', True), namer.compiled_function_name(
-        ('foo.bar', 'baz')))
+  def test_class_name_basic(self):
+    namer = naming.Namer({})
+    self.assertEqual('TfFooBar', namer.class_name(('foo', 'Bar')))
 
-  def test_compiled_class_name_basic(self):
-    namer = naming.Namer({}, True, None, ())
-    self.assertEqual('TfFooBar', namer.compiled_class_name(('foo', 'Bar')))
+  def test_class_name_unsanitized_fqn(self):
+    namer = naming.Namer({})
+    self.assertEqual('TfFooBarBaz', namer.class_name(('foo.bar', 'Baz')))
 
-  def test_compiled_class_name_unsanitized_fqn(self):
-    namer = naming.Namer({}, True, None, ())
-    self.assertEqual('TfFooBarBaz',
-                     namer.compiled_class_name(('foo.bar', 'Baz')))
-
-  def test_compiled_function_name_avoids_global_conflicts(self):
-    def foo():
-      pass
-
-    namer = naming.Namer({'tf__foo': 1}, True, None, ())
-    self.assertEqual(('tf__foo_1', True),
-                     namer.compiled_function_name('foo', foo))
+  def test_function_name_avoids_global_conflicts(self):
+    namer = naming.Namer({'tf__foo': 1})
+    self.assertEqual('tf__foo_1', namer.function_name('foo'))
 
   def test_new_symbol_tracks_names(self):
-    namer = naming.Namer({}, True, None, ())
+    namer = naming.Namer({})
     self.assertEqual('temp', namer.new_symbol('temp', set()))
     self.assertItemsEqual(('temp',), namer.generated_names)
 
   def test_new_symbol_avoids_duplicates(self):
-    namer = naming.Namer({}, True, None, ())
+    namer = naming.Namer({})
     self.assertEqual('temp', namer.new_symbol('temp', set()))
     self.assertEqual('temp_1', namer.new_symbol('temp', set()))
     self.assertItemsEqual(('temp', 'temp_1'), namer.generated_names)
 
   def test_new_symbol_avoids_conflicts(self):
-    namer = naming.Namer({'temp': 1}, True, None, ())
+    namer = naming.Namer({'temp': 1})
     # temp is reserved in the global namespace
     self.assertEqual('temp_1', namer.new_symbol('temp', set()))
     # temp_2 is reserved in the local namespace
diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py
index 5ed125e..9e223d1 100644
--- a/tensorflow/python/autograph/impl/api.py
+++ b/tensorflow/python/autograph/impl/api.py
@@ -33,7 +33,6 @@
 # pylint:enable=g-bad-import-order
 
 
-from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.impl import conversion
 from tensorflow.python.autograph.operators import py_builtins
@@ -63,7 +62,6 @@
 # to write converter.
 def convert(
     recursive=False,
-    verbose=converter.Verbosity.BRIEF,
     optional_features=converter.Feature.ALL):
   """Decorator that compiles a function to use TensorFlow ops.
 
@@ -75,7 +73,6 @@
   Args:
     recursive: bool, whether to recursively convert any functions or classes
       that the converted function may use.
-    verbose: converter.Verbosity, the level of verbosity.
     optional_features: converter.Feature, allows toggling optional or
       experimental features. When set to None, only the core features are
       enabled.
@@ -94,7 +91,6 @@
           f, None,
           converter.ConversionOptions(
               recursive=recursive,
-              verbose=verbose,
               force_conversion=True,
               optional_features=optional_features,
           ), args, kwargs)
@@ -293,10 +289,8 @@
       # TODO(b/119246461): This may be more elegantly handled using __get__?
       if f_self is not None:
         effective_args = (f_self,) + args
-        partial_types = (f_self,)
       else:
         effective_args = args
-        partial_types = ()
 
     elif tf_inspect.isclass(f):
       # Constructors
@@ -306,14 +300,12 @@
       target_entity = f
       arg_map_target = f.__init__
       effective_args = args
-      partial_types = ()
 
     elif hasattr(f, '__call__') and hasattr(f, '__class__'):
       # Callable objects
       target_entity = f.__call__
       arg_map_target = f.__call__
       effective_args = (f,) + args
-      partial_types = (f.__class__,)
 
     else:
       target_entity = f
@@ -325,29 +317,12 @@
       arg_class = arg.__class__
       arg_types[name] = (arg_class.__name__, arg_class)
 
-    # When called from within a decorator, this is the only indication that
-    # the function is a method - it appears that the decorator is applied
-    # before the method is bound.
-    if not partial_types:
-      if 'self' in arg_values:
-        if tf_inspect.isclass(arg_values['self'].__class__):
-          partial_types = (arg_values['self'].__class__,)
-      elif 'cls' in arg_values:
-        if tf_inspect.isclass(arg_values['cls']):
-          partial_types = (arg_values['cls'],)
-
-    logging.log(3, 'Partial types in conversion of %s: %s', target_entity,
-                partial_types)
-
     converted_f = to_graph(
         target_entity,
         recursive=options.recursive,
         arg_values=arg_values,
         arg_types=arg_types,
-        experimental_optional_features=options.optional_features,
-        experimental_strip_decorators=options.strip_decorators,
-        experimental_verbose=options.verbose,
-        experimental_partial_types=partial_types)
+        experimental_optional_features=options.optional_features)
 
     if logging.has_verbosity(2):
       logging.log(2, 'Defaults of %s : %s', converted_f,
@@ -402,6 +377,7 @@
   return False
 
 
+# TODO(mdan): Remove obsolete args.
 @tf_export('autograph.to_graph')
 def to_graph(entity,
              recursive=True,
@@ -466,14 +442,9 @@
     experimental_optional_features: `None`, a tuple of, or a single
       `tf.autograph.experimental.Feature` value. Controls the use of
       optional features in the conversion process.
-    experimental_strip_decorators: A tuple specifying decorators that should be
-      excluded from the compiled output. By default, when converting a function
-      before the decorators are applied, the compiled output will include those
-      decorators.
-    experimental_verbose: The level of printing verbosity to use, as a
-      `tf.autograph.experimental.Verbosity` value.
-    experimental_partial_types: A `set` of `type` values, reserved for internal
-      use.
+    experimental_strip_decorators: Deprecated, unused.
+    experimental_verbose: Deprecated, unused.
+    experimental_partial_types: Deprecated, unused.
 
   Returns:
     Same as `entity`, the converted Python function or class.
@@ -481,26 +452,18 @@
   Raises:
     ValueError: If the entity could not be converted.
   """
-  try:
-    if experimental_strip_decorators is None:
-      experimental_strip_decorators = ()
-    experimental_strip_decorators += (convert, do_not_convert, converted_call)
+  del experimental_strip_decorators
+  del experimental_verbose
+  del experimental_partial_types
 
+  try:
     program_ctx = converter.ProgramContext(
         options=converter.ConversionOptions(
             recursive=recursive,
-            verbose=experimental_verbose,
-            strip_decorators=experimental_strip_decorators,
             optional_features=experimental_optional_features),
-        partial_types=experimental_partial_types,
-        autograph_module=tf_inspect.getmodule(to_graph),
-        uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
-    _, name, namespace = conversion.entity_to_graph(entity, program_ctx,
-                                                    arg_values, arg_types)
-
-    nodes = []
-    for dep in reversed(program_ctx.conversion_order):
-      nodes.extend(program_ctx.dependency_cache[dep])
+        autograph_module=tf_inspect.getmodule(to_graph))
+    nodes, name, namespace = conversion.entity_to_graph(entity, program_ctx,
+                                                        arg_values, arg_types)
 
     compiled_module, _ = compiler.ast_to_object(
         nodes,
@@ -513,9 +476,6 @@
       # Avoid overwriting entities that have been transformed.
       if key not in compiled_module.__dict__:
         compiled_module.__dict__[key] = val
-    for key, val in program_ctx.additional_symbols.items():
-      if key not in compiled_module.__dict__:
-        compiled_module.__dict__[key] = val
     compiled = getattr(compiled_module, name)
 
     if hasattr(entity, '__defaults__'):
@@ -584,25 +544,21 @@
     experimental_optional_features: `None`, a tuple of, or a single
       `tf.autograph.experimental.Feature` value. Controls the use of
       optional features in the conversion process.
-    experimental_partial_types: A `set` of `type` values, reserved for internal
-      use.
+    experimental_partial_types: Deprecated, unused.
 
   Returns:
     The converted code as string.
   """
+  del experimental_partial_types
+
   program_ctx = converter.ProgramContext(
       options=converter.ConversionOptions(
           recursive=recursive,
-          verbose=converter.Verbosity.BRIEF,
-          strip_decorators=(convert, do_not_convert, converted_call),
           optional_features=experimental_optional_features),
-      partial_types=experimental_partial_types,
-      autograph_module=tf_inspect.getmodule(to_graph),
-      uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
-  conversion.entity_to_graph(entity, program_ctx, arg_values, arg_types)
+      autograph_module=tf_inspect.getmodule(to_graph))
+  nodes, _, _ = conversion.entity_to_graph(entity, program_ctx, arg_values,
+                                           arg_types)
 
-  code = '\n'.join(
-      compiler.ast_to_source(program_ctx.dependency_cache[dep], indentation)
-      for dep in reversed(program_ctx.conversion_order))
+  code = compiler.ast_to_source(nodes, indentation)
 
   return program_ctx.required_imports + '\n\n' + code
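With `entity_to_graph` now returning the generated nodes directly, `to_graph` and `to_code` no longer walk a dependency cache. A hedged usage sketch of the public entry points touched above; `tf` is assumed to be a build that includes this change, and the printed function name follows the `naming.Namer` convention shown earlier in this diff:

```python
# Hypothetical usage sketch, not an exact reproduction of any test in this PR.
import tensorflow as tf

def square_if_positive(x):
  if x > 0:
    x = x * x
  else:
    x = 0.0
  return x

converted = tf.autograph.to_graph(square_if_positive, recursive=True)
print(tf.autograph.to_code(square_if_positive))  # prints the converted source
```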
diff --git a/tensorflow/python/autograph/impl/api_test.py b/tensorflow/python/autograph/impl/api_test.py
index cdada2c..79a29ca 100644
--- a/tensorflow/python/autograph/impl/api_test.py
+++ b/tensorflow/python/autograph/impl/api_test.py
@@ -372,7 +372,8 @@
                            (constant_op.constant(0),), {})
     self.assertTrue(self.evaluate(x))
 
-    converted_f = api.to_graph(f)
+    converted_f = api.to_graph(
+        f, experimental_optional_features=converter.Feature.ALL)
     x = api.converted_call(converted_f, None, converter.ConversionOptions(),
                            (constant_op.constant(0),), {})
     self.assertTrue(self.evaluate(x))
diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py
index 90a7738..a9913ef 100644
--- a/tensorflow/python/autograph/impl/conversion.py
+++ b/tensorflow/python/autograph/impl/conversion.py
@@ -46,6 +46,7 @@
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.core import errors as ag_errors
 from tensorflow.python.autograph.core import function_wrapping
+from tensorflow.python.autograph.core import naming
 from tensorflow.python.autograph.core import unsupported_features_checker
 from tensorflow.python.autograph.lang import special_functions
 from tensorflow.python.autograph.pyct import ast_util
@@ -185,11 +186,11 @@
   logging.log(1, 'Converting %s', o)
 
   if tf_inspect.isclass(o):
-    node, name, ns = class_to_graph(o, program_ctx)
+    nodes, name, ns = class_to_graph(o, program_ctx)
   elif tf_inspect.isfunction(o):
-    node, name, ns = function_to_graph(o, program_ctx, arg_values, arg_types)
+    nodes, name, ns = function_to_graph(o, program_ctx, arg_values, arg_types)
   elif tf_inspect.ismethod(o):
-    node, name, ns = function_to_graph(o, program_ctx, arg_values, arg_types)
+    nodes, name, ns = function_to_graph(o, program_ctx, arg_values, arg_types)
   # TODO(mdan,yashkatariya): Remove when object conversion is implemented.
   elif hasattr(o, '__class__'):
     raise NotImplementedError(
@@ -212,39 +213,22 @@
   template = '''
       entity.autograph_info__ = {}
   '''
-  node.extend(templates.replace(template, entity=name))
-
-  program_ctx.add_to_cache(o, node)
+  nodes.extend(templates.replace(template, entity=name))
 
   if logging.has_verbosity(2):
     logging.log(2, 'Compiled output of %s:\n\n%s\n', o,
-                compiler.ast_to_source(node))
+                compiler.ast_to_source(nodes))
   if logging.has_verbosity(4):
-    for n in node:
+    for n in nodes:
       logging.log(4, 'Compiled AST of %s:\n\n%s\n\n', o,
                   pretty_printer.fmt(n, color=False))
 
-  if program_ctx.options.recursive:
-    while True:
-      candidate = None
-      for obj in program_ctx.name_map.keys():
-        if obj not in program_ctx.dependency_cache:
-          candidate = obj
-          break
-      if candidate is None:
-        break
-      if (hasattr(candidate, 'im_class') and
-          getattr(candidate, 'im_class') not in program_ctx.partial_types):
-        # Class members are converted with their objects, unless they're
-        # only converted partially.
-        continue
-      entity_to_graph(candidate, program_ctx, {}, {})
-
-  return node, name, ns
+  return nodes, name, ns
 
 
 def class_to_graph(c, program_ctx):
   """Specialization of `entity_to_graph` for classes."""
+  # TODO(mdan): Revisit this altogether. Not sure we still need it.
   converted_members = {}
   method_filter = lambda m: tf_inspect.isfunction(m) or tf_inspect.ismethod(m)
   members = tf_inspect.getmembers(c, predicate=method_filter)
@@ -256,25 +240,22 @@
     # Only convert the members that are directly defined by the class.
     if inspect_utils.getdefiningclass(m, c) is not c:
       continue
-    node, _, namespace = function_to_graph(
+    nodes, _, namespace = function_to_graph(
         m,
         program_ctx=program_ctx,
         arg_values={},
         arg_types={'self': (c.__name__, c)},
-        owner_type=c)
+        do_rename=False)
     if class_namespace is None:
       class_namespace = namespace
     else:
       class_namespace.update(namespace)
-    converted_members[m] = node[0]
-  namer = program_ctx.new_namer(class_namespace)
-  class_name = namer.compiled_class_name(c.__name__, c)
+    converted_members[m] = nodes[0]
+  namer = naming.Namer(class_namespace)
+  class_name = namer.class_name(c.__name__)
 
-  # TODO(mdan): This needs to be explained more thoroughly.
   # Process any base classes: if the superclass is of a whitelisted type, an
-  # absolute import line is generated. Otherwise, it is marked for conversion
-  # (as a side effect of the call to namer.compiled_class_name() followed by
-  # program_ctx.update_name_map(namer)).
+  # absolute import line is generated.
   output_nodes = []
   renames = {}
   base_names = []
@@ -290,11 +271,12 @@
               names=[gast.alias(name=base.__name__, asname=alias)],
               level=0))
     else:
-      # This will trigger a conversion into a class with this name.
-      alias = namer.compiled_class_name(base.__name__, base)
+      raise NotImplementedError(
+          'Conversion of classes that do not directly extend classes from'
+          ' whitelisted modules is temporarily suspended. If this breaks'
+          ' existing code please notify the AutoGraph team immediately.')
     base_names.append(alias)
     renames[qual_names.QN(base.__name__)] = qual_names.QN(alias)
-  program_ctx.update_name_map(namer)
 
   # Generate the definition of the converted class.
   bases = [gast.Name(n, gast.Load(), None) for n in base_names]
@@ -326,6 +308,7 @@
 ag_internal = None
 
 
+# TODO(mdan): Move into core or replace with an actual importable module.
 def _add_self_references(namespace, autograph_module):
   """Adds namespace references to the module that exposes the api itself."""
   global ag_internal
@@ -349,11 +332,7 @@
   _add_reserved_symbol(namespace, 'ag__', ag_internal)
 
 
-def function_to_graph(f,
-                      program_ctx,
-                      arg_values,
-                      arg_types,
-                      owner_type=None):
+def function_to_graph(f, program_ctx, arg_values, arg_types, do_rename=True):
   """Specialization of `entity_to_graph` for callable functions."""
 
   node, source = parser.parse_entity(f)
@@ -379,15 +358,14 @@
   origin_info.resolve(node, source, f)
   namespace = inspect_utils.getnamespace(f)
   _add_self_references(namespace, program_ctx.autograph_module)
-  namer = program_ctx.new_namer(namespace)
+  namer = naming.Namer(namespace)
 
   entity_info = transformer.EntityInfo(
       source_code=source,
       source_file='<fragment>',
       namespace=namespace,
       arg_values=arg_values,
-      arg_types=arg_types,
-      owner_type=owner_type)
+      arg_types=arg_types)
   context = converter.EntityContext(namer, entity_info, program_ctx)
   try:
     node = node_to_graph(node, context)
@@ -401,18 +379,13 @@
     node = gast.Assign(
         targets=[gast.Name(new_name, gast.Store(), None)], value=node)
 
-  else:
+  elif do_rename:
     # TODO(mdan): This somewhat duplicates the renaming logic in call_trees.py
-    new_name, did_rename = namer.compiled_function_name(f.__name__, f,
-                                                        owner_type)
-    if did_rename:
-      node.name = new_name
-    else:
-      new_name = f.__name__
-      assert node.name == new_name
-
-  program_ctx.update_name_map(namer)
-  # TODO(mdan): Use this at compilation.
+    new_name = namer.function_name(f.__name__)
+    node.name = new_name
+  else:
+    new_name = f.__name__
+    assert node.name == new_name
 
   return [node], new_name, namespace
 
diff --git a/tensorflow/python/autograph/impl/conversion_test.py b/tensorflow/python/autograph/impl/conversion_test.py
index ddda408..7902fa6 100644
--- a/tensorflow/python/autograph/impl/conversion_test.py
+++ b/tensorflow/python/autograph/impl/conversion_test.py
@@ -21,11 +21,10 @@
 import gast
 
 from tensorflow.python.autograph import utils
-from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.impl import api
-from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.autograph.impl import conversion
+from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.framework import constant_op
 from tensorflow.python.keras.engine import training
 from tensorflow.python.platform import test
@@ -36,9 +35,7 @@
   def _simple_program_ctx(self):
     return converter.ProgramContext(
         options=converter.ConversionOptions(recursive=True),
-        partial_types=(),
-        autograph_module=api,
-        uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
+        autograph_module=api)
 
   def test_is_whitelisted_for_graph(self):
 
@@ -89,11 +86,8 @@
       return g(a)
 
     program_ctx = self._simple_program_ctx()
-    conversion.entity_to_graph(f, program_ctx, None, None)
-
-    self.assertTrue(f in program_ctx.dependency_cache)
-    self.assertFalse(g in program_ctx.dependency_cache)
-    f_node = program_ctx.dependency_cache[f][0]
+    nodes, _, _ = conversion.entity_to_graph(f, program_ctx, None, None)
+    f_node = nodes[0]
     self.assertEqual('tf__f', f_node.name)
 
   def test_entity_to_graph_class_hierarchy(self):
@@ -122,16 +116,8 @@
         return self.y
 
     program_ctx = self._simple_program_ctx()
-    conversion.entity_to_graph(TestSubclass, program_ctx, None, None)
-
-    self.assertTrue(TestBase in program_ctx.dependency_cache)
-    self.assertTrue(TestSubclass in program_ctx.dependency_cache)
-    # The returned nodes will include:
-    # <import nodes>, <class node>, <assignment node>
-    self.assertEqual('TfTestBase',
-                     program_ctx.dependency_cache[TestBase][-2].name)
-    self.assertEqual('TfTestSubclass',
-                     program_ctx.dependency_cache[TestSubclass][-2].name)
+    with self.assertRaisesRegex(NotImplementedError, 'classes.*whitelisted'):
+      conversion.entity_to_graph(TestSubclass, program_ctx, None, None)
 
   def test_entity_to_graph_class_hierarchy_whitelisted(self):
 
@@ -145,16 +131,12 @@
         return 3 * x
 
     program_ctx = self._simple_program_ctx()
-    conversion.entity_to_graph(TestSubclass, program_ctx, None, None)
+    nodes, name, _ = conversion.entity_to_graph(TestSubclass, program_ctx, None,
+                                                None)
+    class_node = nodes[-2]  # TODO(mdan): This is brittle.
 
-    self.assertTrue(TestSubclass in program_ctx.dependency_cache)
-    self.assertFalse(training.Model in program_ctx.dependency_cache)
-    self.assertEqual(
-        'Model', program_ctx.dependency_cache[TestSubclass][0].names[0].name)
-    # The returned nodes will include:
-    # <import nodes>, <class node>, <assignment node>
-    self.assertEqual('TfTestSubclass',
-                     program_ctx.dependency_cache[TestSubclass][-2].name)
+    self.assertEqual(name, 'TfTestSubclass')
+    self.assertEqual(class_node.name, 'TfTestSubclass')
 
   def test_entity_to_graph_lambda(self):
     b = 2
diff --git a/tensorflow/python/autograph/operators/control_flow.py b/tensorflow/python/autograph/operators/control_flow.py
index b58f971..365afdd 100644
--- a/tensorflow/python/autograph/operators/control_flow.py
+++ b/tensorflow/python/autograph/operators/control_flow.py
@@ -228,7 +228,7 @@
   return state
 
 
-def if_stmt(cond, body, orelse):
+def if_stmt(cond, body, orelse, get_state, set_state):
   """Functional form of an if statement.
 
   Args:
@@ -237,25 +237,51 @@
         as return type.
     orelse: Callable with no arguments, and outputs of the negative (else)
         branch as return type.
+    get_state: Function that returns a tuple containing the values of all
+        composite symbols modified within the conditional. This allows access to
+        state that branches may mutate through side effects. This function is
+        not needed and should not be called when dispatching to code matching
+        Python's default semantics. This is useful for checkpointing to avoid
+        unintended side-effects when staging requires evaluating all code-paths.
+    set_state: Function to set the values of all composite symbols modified
+        within the conditional. This is the complement to get_state, used to
+        restore checkpointed values. The single argument is a tuple containing
+        values for each composite symbol that may be modified in a branch of the
+        conditional. This is usually the result of a call to get_state.
 
   Returns:
     Tuple containing the statement outputs.
   """
   if tensor_util.is_tensor(cond):
-    return tf_if_stmt(cond, body, orelse)
+    return tf_if_stmt(cond, body, orelse, get_state, set_state)
   else:
     return _py_if_stmt(cond, body, orelse)
 
 
-def tf_if_stmt(cond, body, orelse):
+def tf_if_stmt(cond, body, orelse, get_state, set_state):
   """Overload of if_stmt that stages a TF cond."""
-  protected_body = _wrap_in_protection_from_undefined(body, branch_name='if')
-  protected_orelse = _wrap_in_protection_from_undefined(orelse,
-                                                        branch_name='else')
+  checkpointed_body = _wrap_in_state_isolation(body, get_state, set_state)
+  checkpointed_orelse = _wrap_in_state_isolation(orelse, get_state,
+                                                 set_state)
+  protected_body = _wrap_in_protection_from_undefined(
+      checkpointed_body, branch_name='if')
+  protected_orelse = _wrap_in_protection_from_undefined(
+      checkpointed_orelse, branch_name='else')
 
   return control_flow_ops.cond(cond, protected_body, protected_orelse)
 
 
+def _wrap_in_state_isolation(func, get_state, set_state):
+  """Wraps function to checkpoint the value of modified composites."""
+  def checkpoint_func():
+    init_values = get_state()
+    ret_values = func()
+    set_state(init_values)
+    return ret_values
+
+  return checkpoint_func
+
+
 def _wrap_in_protection_from_undefined(func, branch_name):
   """Wraps function to raise useful error when it returns undefined symbols."""
   def protected_func():
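The `get_state`/`set_state` pair described in the docstring above gives `tf_if_stmt` a way to snapshot and restore composite state around each branch. A hedged sketch of the new `if_stmt` contract, using a plain Python list as the composite state; with a Python `bool` condition the call dispatches to `_py_if_stmt`, so the state functions are simply part of the signature here (the updated `control_flow_test.py` below passes no-op equivalents):

```python
# Illustrative only; names and values are placeholders.
from tensorflow.python.autograph.operators import control_flow

state = [0]

def body():
  state[0] = 1          # side effect on the composite
  return 'took if branch'

def orelse():
  state[0] = -1
  return 'took else branch'

result = control_flow.if_stmt(
    cond=True,
    body=body,
    orelse=orelse,
    get_state=lambda: (state[0],),                              # checkpoint
    set_state=lambda values: state.__setitem__(0, values[0]))   # restore
```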
diff --git a/tensorflow/python/autograph/operators/control_flow_test.py b/tensorflow/python/autograph/operators/control_flow_test.py
index e5af283..e17d548 100644
--- a/tensorflow/python/autograph/operators/control_flow_test.py
+++ b/tensorflow/python/autograph/operators/control_flow_test.py
@@ -118,11 +118,20 @@
 class IfStmtTest(test.TestCase):
 
   def single_return_if_stmt(self, cond):
-    return control_flow.if_stmt(cond=cond, body=lambda: 1, orelse=lambda: -1)
+    return control_flow.if_stmt(
+        cond=cond,
+        body=lambda: 1,
+        orelse=lambda: -1,
+        get_state=lambda: (),
+        set_state=lambda _: None)
 
   def multi_return_if_stmt(self, cond):
     return control_flow.if_stmt(
-        cond=cond, body=lambda: (1, 2), orelse=lambda: (-1, -2))
+        cond=cond,
+        body=lambda: (1, 2),
+        orelse=lambda: (-1, -2),
+        get_state=lambda: (),
+        set_state=lambda _: None)
 
   @test_util.run_deprecated_v1
   def test_tensor(self):
diff --git a/tensorflow/python/autograph/pyct/common_transformers/anf_test.py b/tensorflow/python/autograph/pyct/common_transformers/anf_test.py
index 58663d2..5b3bc43 100644
--- a/tensorflow/python/autograph/pyct/common_transformers/anf_test.py
+++ b/tensorflow/python/autograph/pyct/common_transformers/anf_test.py
@@ -74,8 +74,7 @@
         source_file=None,
         namespace=None,
         arg_values=None,
-        arg_types=None,
-        owner_type=None)
+        arg_types=None)
     return transformer.Context(entity_info)
 
   def test_basic(self):
diff --git a/tensorflow/python/autograph/pyct/static_analysis/activity_test.py b/tensorflow/python/autograph/pyct/static_analysis/activity_test.py
index 595e95b..0dddb44 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/activity_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/activity_test.py
@@ -118,8 +118,7 @@
         source_file=None,
         namespace={},
         arg_values=None,
-        arg_types=None,
-        owner_type=None)
+        arg_types=None)
     node = qual_names.resolve(node)
     ctx = transformer.Context(entity_info)
     node = activity.resolve(node, ctx)
diff --git a/tensorflow/python/autograph/pyct/static_analysis/live_values_test.py b/tensorflow/python/autograph/pyct/static_analysis/live_values_test.py
index a8d4e25..f8ae3d6 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/live_values_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/live_values_test.py
@@ -47,8 +47,7 @@
         source_file=None,
         namespace=namespace,
         arg_values=None,
-        arg_types=arg_types,
-        owner_type=None)
+        arg_types=arg_types)
     node = qual_names.resolve(node)
     graphs = cfg.build(node)
     ctx = transformer.Context(entity_info)
diff --git a/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py b/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py
index f14b1a3..904386b 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py
@@ -39,8 +39,7 @@
         source_file=None,
         namespace={},
         arg_values=None,
-        arg_types=None,
-        owner_type=None)
+        arg_types=None)
     node = qual_names.resolve(node)
     ctx = transformer.Context(entity_info)
     node = activity.resolve(node, ctx)
diff --git a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py
index 848c546..3fb9364 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py
@@ -39,8 +39,7 @@
         source_file=None,
         namespace={},
         arg_values=None,
-        arg_types=None,
-        owner_type=None)
+        arg_types=None)
     node = qual_names.resolve(node)
     ctx = transformer.Context(entity_info)
     node = activity.resolve(node, ctx)
diff --git a/tensorflow/python/autograph/pyct/static_analysis/type_info_test.py b/tensorflow/python/autograph/pyct/static_analysis/type_info_test.py
index c6cf91e..2263667 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/type_info_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/type_info_test.py
@@ -68,8 +68,7 @@
         source_file=None,
         namespace=namespace,
         arg_values=None,
-        arg_types=arg_types,
-        owner_type=None)
+        arg_types=arg_types)
     node = qual_names.resolve(node)
     graphs = cfg.build(node)
     ctx = transformer.Context(entity_info)
diff --git a/tensorflow/python/autograph/pyct/transformer.py b/tensorflow/python/autograph/pyct/transformer.py
index d8e093c..1507192 100644
--- a/tensorflow/python/autograph/pyct/transformer.py
+++ b/tensorflow/python/autograph/pyct/transformer.py
@@ -59,18 +59,16 @@
       parameters).
     arg_values: dict[str->*], containing parameter values, if known.
     arg_types: dict[str->*], containing parameter types, if known.
-    owner_type: The surrounding class type of the function, if present.
   """
 
   # TODO(mdan): Remove the default and update tests.
-  def __init__(self, source_code, source_file, namespace, arg_values, arg_types,
-               owner_type):
+  def __init__(self, source_code, source_file, namespace, arg_values,
+               arg_types):
     self.source_code = source_code
     self.source_file = source_file
     self.namespace = namespace
     self.arg_values = {} if arg_values is None else arg_values
     self.arg_types = {} if arg_types is None else arg_types
-    self.owner_type = owner_type
 
 
 class _StateStack(object):
diff --git a/tensorflow/python/autograph/pyct/transformer_test.py b/tensorflow/python/autograph/pyct/transformer_test.py
index d97c1f0..9d83653 100644
--- a/tensorflow/python/autograph/pyct/transformer_test.py
+++ b/tensorflow/python/autograph/pyct/transformer_test.py
@@ -34,8 +34,7 @@
         source_file=None,
         namespace=None,
         arg_values=None,
-        arg_types=None,
-        owner_type=None)
+        arg_types=None)
     return transformer.Context(entity_info)
 
   def test_entity_scope_tracking(self):
diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 68b253b..db57f64 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -27,7 +27,7 @@
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 3, 11)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 3, 12)
 
 
 @tf_export("compat.forward_compatible")
diff --git a/tensorflow/python/compiler/tensorrt/BUILD b/tensorflow/python/compiler/tensorrt/BUILD
index 92f0009..5cb9544 100644
--- a/tensorflow/python/compiler/tensorrt/BUILD
+++ b/tensorflow/python/compiler/tensorrt/BUILD
@@ -67,19 +67,12 @@
     ],
 )
 
-# TODO(aaroey): this wrapper has been causing troubles of double linking, so
-# either get rid of it, or split to make it contain minimum dependencies.
 tf_py_wrap_cc(
     name = "wrap_conversion",
     srcs = ["trt_conversion.i"],
     copts = tf_copts(),
-    swig_includes = [
-        "//tensorflow/python:platform/base.i",
-    ],
     deps = [
         "//tensorflow/compiler/tf2tensorrt:py_utils",
-        "//tensorflow/compiler/tf2tensorrt:trt_conversion",
-        "//tensorflow/compiler/tf2tensorrt:trt_op_kernels",
         "//third_party/python_runtime:headers",
     ],
 )
diff --git a/tensorflow/python/compiler/tensorrt/trt_conversion.i b/tensorflow/python/compiler/tensorrt/trt_conversion.i
index db4bb99..d6e8eac 100644
--- a/tensorflow/python/compiler/tensorrt/trt_conversion.i
+++ b/tensorflow/python/compiler/tensorrt/trt_conversion.i
@@ -17,8 +17,6 @@
 %{
 #define SWIG_FILE_WITH_INIT
 %}
-%include "std_string.i"
-%include "tensorflow/python/platform/base.i"
 
 %{
 struct version_struct{
@@ -42,8 +40,6 @@
 
 %}
 
-_LIST_OUTPUT_TYPEMAP(int, PyLong_FromLong);
-
 %typemap(out) version_struct {
   PyObject *tuple = version_helper(&$1);
   if (!tuple) SWIG_fail;
@@ -54,10 +50,10 @@
 #include "tensorflow/compiler/tf2tensorrt/utils/py_utils.h"
 %}
 
-%ignoreall
-%unignore get_linked_tensorrt_version;
-%unignore get_loaded_tensorrt_version;
-%unignore is_tensorrt_enabled;
+%ignore "";
+%rename("%s") get_linked_tensorrt_version;
+%rename("%s") get_loaded_tensorrt_version;
+%rename("%s") is_tensorrt_enabled;
 
 %{
 
@@ -87,4 +83,4 @@
 version_struct get_loaded_tensorrt_version();
 bool is_tensorrt_enabled();
 
-%unignoreall
+%rename("%s") "";
diff --git a/tensorflow/python/data/experimental/__init__.py b/tensorflow/python/data/experimental/__init__.py
index f7103a6..a5da41b 100644
--- a/tensorflow/python/data/experimental/__init__.py
+++ b/tensorflow/python/data/experimental/__init__.py
@@ -48,7 +48,6 @@
 @@copy_to_device
 @@dense_to_sparse_batch
 @@enumerate_dataset
-@@filter_for_shard
 @@get_next_as_optional
 @@get_single_element
 @@group_by_reducer
@@ -92,7 +91,6 @@
 from tensorflow.python.data.experimental.ops.counter import Counter
 from tensorflow.python.data.experimental.ops.enumerate_ops import enumerate_dataset
 from tensorflow.python.data.experimental.ops.error_ops import ignore_errors
-from tensorflow.python.data.experimental.ops.filter_for_shard_ops import filter_for_shard
 from tensorflow.python.data.experimental.ops.get_single_element import get_single_element
 from tensorflow.python.data.experimental.ops.grouping import bucket_by_sequence_length
 from tensorflow.python.data.experimental.ops.grouping import group_by_reducer
diff --git a/tensorflow/python/data/experimental/ops/BUILD b/tensorflow/python/data/experimental/ops/BUILD
index f27d92e..d79d452 100644
--- a/tensorflow/python/data/experimental/ops/BUILD
+++ b/tensorflow/python/data/experimental/ops/BUILD
@@ -163,18 +163,6 @@
 )
 
 py_library(
-    name = "filter_for_shard_ops",
-    srcs = ["filter_for_shard_ops.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:ops",
-        "//tensorflow/python:tensor_util",
-    ],
-)
-
-py_library(
     name = "error_ops",
     srcs = ["error_ops.py"],
     srcs_version = "PY2AND3",
@@ -466,7 +454,6 @@
         ":distribute",
         ":enumerate_ops",
         ":error_ops",
-        ":filter_for_shard_ops",
         ":get_single_element",
         ":grouping",
         ":indexed_dataset_ops",
diff --git a/tensorflow/python/data/experimental/ops/batching.py b/tensorflow/python/data/experimental/ops/batching.py
index e251c94..5ad917e 100644
--- a/tensorflow/python/data/experimental/ops/batching.py
+++ b/tensorflow/python/data/experimental/ops/batching.py
@@ -665,6 +665,11 @@
   return _apply_fn
 
 
+@deprecation.deprecated(
+    None,
+    "Use `tf.data.Dataset.map(map_func, num_parallel_calls)` followed by "
+    "`tf.data.Dataset.batch(batch_size, drop_remainder)`. Static tf.data "
+    "optimizations will take care of using the fused implementation.")
 @tf_export("data.experimental.map_and_batch")
 def map_and_batch(map_func,
                   batch_size,
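The deprecation message above points at the unfused equivalents; a hedged sketch of the suggested replacement, with the dataset and parameters purely illustrative:

```python
import tensorflow as tf

dataset = tf.data.Dataset.range(1000)
dataset = dataset.map(lambda x: x * 2, num_parallel_calls=4)
dataset = dataset.batch(32, drop_remainder=True)
```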
diff --git a/tensorflow/python/data/experimental/ops/filter_for_shard_ops.py b/tensorflow/python/data/experimental/ops/filter_for_shard_ops.py
deleted file mode 100644
index 91d3dca..0000000
--- a/tensorflow/python/data/experimental/ops/filter_for_shard_ops.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Naive shard dataset transformation."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import math_ops
-from tensorflow.python.util.tf_export import tf_export
-
-
-@tf_export("data.experimental.filter_for_shard")
-def filter_for_shard(num_shards, shard_index):
-  """Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
-
-  This dataset operator is very useful when running distributed training, as
-  it allows each worker to read a unique subset.
-
-  When reading a single input file, you can skip elements as follows:
-
-  ```python
-  d = tf.data.TFRecordDataset(FLAGS.input_file)
-  d = d.apply(tf.data.experimental.naive_shard(FLAGS.num_workers,
-                                               FLAGS.worker_index))
-  d = d.repeat(FLAGS.num_epochs)
-  d = d.shuffle(FLAGS.shuffle_buffer_size)
-  d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-  ```
-
-  Important caveats:
-
-  - Be sure to shard before you use any randomizing operator (such as
-    shuffle).
-  - Generally it is best if the shard operator is used early in the dataset
-    pipeline. For example, when reading from a set of TFRecord files, shard
-    before converting the dataset to input samples. This avoids reading every
-    file on every worker. The following is an example of an efficient
-    sharding strategy within a complete pipeline:
-
-  ```python
-  d = Dataset.list_files(FLAGS.pattern)
-  d = d.apply(tf.data.experimental.naive_shard(FLAGS.num_workers,
-                                               FLAGS.worker_index))
-  d = d.repeat(FLAGS.num_epochs)
-  d = d.shuffle(FLAGS.shuffle_buffer_size)
-  d = d.interleave(tf.data.TFRecordDataset,
-                   cycle_length=FLAGS.num_readers, block_length=1)
-  d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-  ```
-
-  Args:
-    num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
-      shards operating in parallel.
-    shard_index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
-
-  Returns:
-    A `Dataset` transformation function, which can be passed to
-    `tf.data.Dataset.apply`.
-
-  Raises:
-    ValueError: if `num_shards` or `shard_index` are illegal values. Note: error
-      checking is done on a best-effort basis, and errors aren't guaranteed to
-      be caught upon dataset creation. (e.g. providing in a placeholder tensor
-      bypasses the early checking, and will instead result in an error during
-      a session.run call.)
-  """
-  num_shards = ops.convert_to_tensor(
-      num_shards, name="num_shards", dtype=dtypes.int64)
-  num_shards_static = tensor_util.constant_value(num_shards)
-  shard_index = ops.convert_to_tensor(shard_index, name="shard_index",
-                                      dtype=dtypes.int64)
-  shard_index_static = tensor_util.constant_value(shard_index)
-
-  if num_shards_static is not None and num_shards_static < 1:
-    raise ValueError("num_shards must be >= 1; got: %s" % num_shards_static)
-  if shard_index_static is not None and shard_index_static < 0:
-    raise ValueError("shard_index must be >= 0; got: %s" % shard_index_static)
-  if (shard_index_static is not None and num_shards_static is not None and
-      shard_index_static >= num_shards_static):
-    raise ValueError("shard_index must be < num_shards; %s is not < %s" %
-                     (shard_index_static, num_shards_static))
-
-  def filter_fn(elem_index, _):
-    mod_result = math_ops.mod(elem_index, num_shards)
-    return math_ops.equal(mod_result, shard_index)
-
-  def _apply_fn(dataset):
-    # pylint: disable=protected-access
-    return dataset._enumerate().filter(filter_fn).map(lambda _, elem: elem)
-
-  return _apply_fn
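With `filter_for_shard` removed, the built-in `tf.data.Dataset.shard` covers the same use case (the `input_ops.py` docstring below now references it). A hedged sketch, with the file pattern and worker counts as placeholders:

```python
import tensorflow as tf

d = tf.data.Dataset.list_files("/path/to/data-*.tfrecord")
d = d.shard(num_shards=4, index=0)  # e.g. 4 workers; this pipeline is worker 0
d = d.interleave(tf.data.TFRecordDataset, cycle_length=4, block_length=1)
```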
diff --git a/tensorflow/python/data/experimental/ops/interleave_ops.py b/tensorflow/python/data/experimental/ops/interleave_ops.py
index 231c6be..f54867b 100644
--- a/tensorflow/python/data/experimental/ops/interleave_ops.py
+++ b/tensorflow/python/data/experimental/ops/interleave_ops.py
@@ -28,6 +28,7 @@
 from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.ops import gen_stateless_random_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -82,6 +83,11 @@
     return "tf.data.experimental.parallel_interleave()"
 
 
+@deprecation.deprecated(
+    None,
+    "Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, "
+    "num_parallel_calls=tf.data.experimental.AUTOTUNE)` instead. If sloppy "
+    "execution is desired, use `tf.data.Options.experimental_determinstic`.")
 @tf_export("data.experimental.parallel_interleave")
 def parallel_interleave(map_func,
                         cycle_length,
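
The new deprecation message points at the core `interleave` path. A hedged sketch of that migration; the file pattern is a placeholder and `tf.data.TFRecordDataset` stands in for an arbitrary per-file dataset constructor:

```python
import tensorflow as tf

file_dataset = tf.data.Dataset.list_files('/tmp/data/*.tfrecord')  # placeholder

dataset = file_dataset.interleave(
    tf.data.TFRecordDataset,
    cycle_length=4,
    block_length=1,
    num_parallel_calls=tf.data.experimental.AUTOTUNE)

# For sloppy (non-deterministic) ordering, set the option instead of a flag.
options = tf.data.Options()
options.experimental_deterministic = False
dataset = dataset.with_options(options)
```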
diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py
index c24d10a..38fc3c5 100644
--- a/tensorflow/python/data/experimental/ops/readers.py
+++ b/tensorflow/python/data/experimental/ops/readers.py
@@ -586,6 +586,7 @@
     ```
 
     We can construct a CsvDataset from it as follows:
+
     ```python
     tf.enable_eager_execution()
 
@@ -600,6 +601,7 @@
     ```
 
     The expected output of its iterations is:
+
     ```python
     for element in dataset:
       print(element)
diff --git a/tensorflow/python/data/experimental/ops/shuffle_ops.py b/tensorflow/python/data/experimental/ops/shuffle_ops.py
index 86a615d..98f682e 100644
--- a/tensorflow/python/data/experimental/ops/shuffle_ops.py
+++ b/tensorflow/python/data/experimental/ops/shuffle_ops.py
@@ -23,6 +23,7 @@
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -50,6 +51,11 @@
                                                    variant_tensor)
 
 
+@deprecation.deprecated(
+    None,
+    "Use `tf.data.Dataset.shuffle(buffer_size, seed)` followed by "
+    "`tf.data.Dataset.repeat(count)`. Static tf.data optimizations will take "
+    "care of using the fused implementation.")
 @tf_export("data.experimental.shuffle_and_repeat")
 def shuffle_and_repeat(buffer_size, count=None, seed=None):
   """Shuffles and repeats a Dataset returning a new permutation for each epoch.
diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD
index a911d8c..d018ba2 100644
--- a/tensorflow/python/data/ops/BUILD
+++ b/tensorflow/python/data/ops/BUILD
@@ -26,7 +26,6 @@
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
-        "//tensorflow/python/data/experimental/ops:filter_for_shard_ops",
         "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/experimental/ops:stats_options",
         "//tensorflow/python/data/experimental/ops:threading_options",
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index a57f24a..62c40ea 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -368,7 +368,6 @@
     srcs = ["input_ops.py"],
     deps = [
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python/data/experimental/ops:filter_for_shard_ops",
         "//tensorflow/python/data/util:nest",
     ],
 )
diff --git a/tensorflow/python/distribute/input_ops.py b/tensorflow/python/distribute/input_ops.py
index d9e833b..5121bd9 100644
--- a/tensorflow/python/distribute/input_ops.py
+++ b/tensorflow/python/distribute/input_ops.py
@@ -40,10 +40,9 @@
     dataset: A `tf.data.Dataset` instance, typically the result of a bunch of
       dataset transformations.
     num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        shards operating in parallel. Same usage as in
-        `tf.data.experimental.filter_for_shard`.
+        shards operating in parallel. Same usage as in `tf.data.Dataset.shard`.
     index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
-      Same usage as in `Dataset.shard`.
+      Same usage as in `tf.data.Dataset.shard`.
 
   Returns:
     A modified `Dataset` obtained by updating the pipeline sharded by the
diff --git a/tensorflow/python/distribute/tpu_strategy.py b/tensorflow/python/distribute/tpu_strategy.py
index c78f743..61b372d 100644
--- a/tensorflow/python/distribute/tpu_strategy.py
+++ b/tensorflow/python/distribute/tpu_strategy.py
@@ -348,8 +348,11 @@
 
     # Put the while loop op on TPU host 0.
     with ops.device(self._host_device):
-      replicate_outputs = training_loop.repeat(iterations, rewrite_fn,
-                                               initial_loop_values)
+      if self.steps_per_run == 1:
+        replicate_outputs = rewrite_fn()
+      else:
+        replicate_outputs = training_loop.repeat(iterations, rewrite_fn,
+                                                 initial_loop_values)
 
     del self._outer_control_flow_context
     ctx.run_op = control_flow_ops.group(replicate_outputs)
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index dc068e1..24a954e 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -773,12 +773,10 @@
 
   @soft_device_placement.setter
   def soft_device_placement(self, enabled):
-    if self._context_handle is not None:
-      raise RuntimeError(
-          "Soft placement must be set at program startup")
-
     self._config.allow_soft_placement = enabled
 
+    self._thread_local_data.function_call_options = None
+
   @property
   def log_device_placement(self):
     return self._config.log_device_placement
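
For illustration, a hedged sketch of what the relaxed setter allows: soft placement can now be toggled after the eager context handle exists, instead of only at program startup (the matching getter property is assumed to mirror the setter):

```python
from tensorflow.python.eager import context
from tensorflow.python.framework import constant_op, ops

ops.enable_eager_execution()
_ = constant_op.constant(1.0)     # force the eager context handle to be created
ctx = context.context()
ctx.soft_device_placement = True  # previously raised RuntimeError at this point
# Cached per-thread function call options are cleared so subsequently traced
# functions pick up the new allow_soft_placement setting.
```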
diff --git a/tensorflow/python/framework/func_graph.py b/tensorflow/python/framework/func_graph.py
index c873cac..0b9593c 100644
--- a/tensorflow/python/framework/func_graph.py
+++ b/tensorflow/python/framework/func_graph.py
@@ -678,9 +678,7 @@
           return autograph.converted_call(
               original_func, None,
               autograph.ConversionOptions(
-                  verbose=autograph.Verbosity.BRIEF,
                   recursive=True,
-                  strip_decorators=(def_function.function,),
                   optional_features=autograph_options,
                   force_conversion=True,
               ), args, kwargs)
diff --git a/tensorflow/python/framework/ops_enable_eager_test.py b/tensorflow/python/framework/ops_enable_eager_test.py
index 99d06f1..4da0798 100644
--- a/tensorflow/python/framework/ops_enable_eager_test.py
+++ b/tensorflow/python/framework/ops_enable_eager_test.py
@@ -23,9 +23,10 @@
 from tensorflow.python.platform import googletest
 
 
-class OpsEnableEagerTest(googletest.TestCase):
+class OpsEnableAndDisableEagerTest(googletest.TestCase):
 
-  def test_enable_eager_execution_multiple_times(self):
+  def setUp(self):
+    # Enable eager execution and verify it is active.
     ops.enable_eager_execution()
     self.assertTrue(context.executing_eagerly())
 
@@ -33,6 +34,15 @@
     ops.enable_eager_execution()
     self.assertTrue(context.executing_eagerly())
 
+  def tearDown(self):
+    # Disable eager execution and verify it is inactive.
+    ops.disable_eager_execution()
+    self.assertFalse(context.executing_eagerly())
+
+    # Calling disable eager execution a second time should not cause an error.
+    ops.disable_eager_execution()
+    self.assertFalse(context.executing_eagerly())
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 97e5a99..6d28c97 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -650,7 +650,7 @@
     return None
 
   # Note: this function is meant to help with diagnostics. Its output is purely
-  # a human readable representation, so you may freely modify it to suit your
+  # a human-readable representation, so you may freely modify it to suit your
   # needs.
   def describe(obj, blacklist, leaves_only=False):
     """Returns a custom human-readable summary of obj.
@@ -1202,7 +1202,7 @@
 def run_gpu_only(func=None):
   """Execute the decorated test only if a GPU is available.
 
-  This function is intended to be applied to tests that require the precense
+  This function is intended to be applied to tests that require the presence
   of a GPU. If a GPU is absent, it will simply be skipped.
 
   Args:
@@ -1275,7 +1275,7 @@
       CUDA compute capability required, or None if no requirement.
 
   Returns:
-    True iff a gpu device of the requested kind is available.
+    True if a gpu device of the requested kind is available.
   """
 
   def compute_capability_from_device_desc(device_desc):
@@ -1376,7 +1376,7 @@
 
   Since the feed_dict is empty when not using placeholders we should be able to
   call self.evaluate(), however this requires rewriting the test case.
-  This class shold be considered a stop-gap solution to get tests running with
+  This class should be considered a stop-gap solution to get tests running with
   eager with minimal changes to the actual test.
   """
 
@@ -2177,7 +2177,7 @@
 
   @py_func_if_in_function
   def assertNotAllClose(self, a, b, **kwargs):
-    """Assert that two numpy arrays, or or Tensors, do not have near values.
+    """Assert that two numpy arrays, or Tensors, do not have near values.
 
     Args:
       a: the first value to compare.
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index c1c94dc..dde3352 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -333,6 +333,7 @@
         "layers/normalization.py",
         "layers/pooling.py",
         "layers/recurrent.py",
+        "layers/recurrent_v2.py",
         "layers/serialization.py",
         "layers/wrappers.py",
         "utils/kernelized_utils.py",
@@ -798,9 +799,9 @@
 )
 
 cuda_py_test(
-    name = "unified_lstm_test",
+    name = "lstm_v2_test",
     size = "medium",
-    srcs = ["layers/unified_lstm_test.py"],
+    srcs = ["layers/lstm_v2_test.py"],
     additional_deps = [
         ":keras",
         "@absl_py//absl/testing:parameterized",
@@ -812,9 +813,9 @@
 )
 
 cuda_py_test(
-    name = "unified_gru_test",
+    name = "gru_v2_test",
     size = "medium",
-    srcs = ["layers/unified_gru_test.py"],
+    srcs = ["layers/gru_v2_test.py"],
     additional_deps = [
         ":keras",
         "@absl_py//absl/testing:parameterized",
diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py
index a10629a..3f5f125 100644
--- a/tensorflow/python/keras/activations.py
+++ b/tensorflow/python/keras/activations.py
@@ -165,16 +165,41 @@
 
 @keras_export('keras.activations.tanh')
 def tanh(x):
+  """Hyperbolic Tangent activation function.
+
+  Arguments:
+      x: Input tensor.
+
+  Returns:
+      The tanh activation: `tanh(x) = sinh(x)/cosh(x) = ((exp(x) -
+      exp(-x))/(exp(x) + exp(-x)))`.
+  """
   return nn.tanh(x)
 
 
 @keras_export('keras.activations.sigmoid')
 def sigmoid(x):
+  """Sigmoid activation function.
+
+  Arguments:
+      x: Input tensor.
+
+  Returns:
+      The sigmoid activation: `(1.0 / (1.0 + exp(-x)))`.
+  """
   return nn.sigmoid(x)
 
 
 @keras_export('keras.activations.exponential')
 def exponential(x):
+  """Exponential activation function.
+
+  Arguments:
+      x: Input tensor.
+
+  Returns:
+      The exponential activation: `exp(x)`.
+  """
   return math_ops.exp(x)
 
 
@@ -198,6 +223,14 @@
 
 @keras_export('keras.activations.linear')
 def linear(x):
+  """Linear activation function.
+
+  Arguments:
+      x: Input tensor.
+
+  Returns:
+      The linear activation: `x`.
+  """
   return x
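
The formulas documented above are easy to sanity-check; a small sketch assuming eager execution:

```python
import tensorflow as tf

x = tf.constant([-1.0, 0.0, 1.0])
print(tf.keras.activations.tanh(x))         # (exp(x) - exp(-x)) / (exp(x) + exp(-x))
print(tf.keras.activations.sigmoid(x))      # 1 / (1 + exp(-x))
print(tf.keras.activations.exponential(x))  # exp(x)
print(tf.keras.activations.linear(x))       # identity: returns x unchanged
```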
 
 
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index bf38d76..5951b41 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -35,12 +35,12 @@
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.engine import distributed_training_utils
+from tensorflow.python.keras.engine import network
 from tensorflow.python.keras.engine import training_arrays
 from tensorflow.python.keras.engine import training_distributed
 from tensorflow.python.keras.engine import training_eager
 from tensorflow.python.keras.engine import training_generator
 from tensorflow.python.keras.engine import training_utils
-from tensorflow.python.keras.engine.network import Network
 from tensorflow.python.keras.saving import saving_utils
 from tensorflow.python.keras.utils import data_utils
 from tensorflow.python.keras.utils import losses_utils
@@ -54,7 +54,7 @@
 
 
 @keras_export('keras.models.Model', 'keras.Model')
-class Model(Network):
+class Model(network.Network):
   """`Model` groups layers into an object with training and inference features.
 
   There are two ways to instantiate a `Model`:
@@ -141,6 +141,15 @@
         return super(Model, self).get_weights()
     return super(Model, self).get_weights()
 
+  def load_weights(self, filepath, by_name=False):
+    """Loads all layer weights, either from a TensorFlow or an HDF5 file."""
+    if distributed_training_utils.is_tpu_strategy(self._distribution_strategy):
+      if (self._distribution_strategy.extended.steps_per_run > 1 and
+          (not network._is_hdf5_filepath(filepath))):  # pylint: disable=protected-access
+        raise ValueError('Load weights is not yet supported with TPUStrategy '
+                         'with steps_per_run greater than 1.')
+    return super(Model, self).load_weights(filepath, by_name)
+
   @trackable.no_automatic_dependency_tracking
   def compile(self,
               optimizer,
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index 8689e0d..b3b0298 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -152,10 +152,11 @@
 from tensorflow.python.keras.layers.recurrent import LSTMCell
 from tensorflow.python.keras.layers.recurrent import PeepholeLSTMCell
 from tensorflow.python.keras.layers.recurrent import SimpleRNN
+
 from tensorflow.python.keras.layers.recurrent import GRU
 from tensorflow.python.keras.layers.recurrent import LSTM
-from tensorflow.python.keras.layers.recurrent import UnifiedGRU
-from tensorflow.python.keras.layers.recurrent import UnifiedLSTM
+from tensorflow.python.keras.layers.recurrent_v2 import GRU as GRU_v2
+from tensorflow.python.keras.layers.recurrent_v2 import LSTM as LSTM_v2
 
 # Convolutional-recurrent layers.
 from tensorflow.python.keras.layers.convolutional_recurrent import ConvLSTM2D
diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py
index 672784c..11f78e8 100644
--- a/tensorflow/python/keras/layers/core.py
+++ b/tensorflow/python/keras/layers/core.py
@@ -46,6 +46,7 @@
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import standard_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import keras_export
 
@@ -644,7 +645,18 @@
 
 @keras_export('keras.layers.Lambda')
 class Lambda(Layer):
-  """Wraps arbitrary expression as a `Layer` object.
+  """Wraps arbitrary expressions as a `Layer` object.
+
+  The `Lambda` layer exists so that arbitrary TensorFlow functions
+  can be used when constructing `Sequential` and Functional API
+  models. `Lambda` layers are best suited for simple operations or
+  quick experimentation. For more advanced use cases, subclassing
+  `keras.layers.Layer` is preferred. One reason for this is that
+  when saving a Model, `Lambda` layers are saved by serializing the
+  Python bytecode, whereas subclassed Layers are saved via overriding
+  their `get_config` method and are thus more portable. Models that rely
+  on subclassed Layers are also often easier to visualize and reason
+  about.
 
   Examples:
 
@@ -667,30 +679,49 @@
   model.add(Lambda(antirectifier))
   ```
 
+  Variables can be created within a `Lambda` layer. Like with
+  other layers, these variables will be created only once and reused
+  if the `Lambda` layer is called on new inputs. If creating more
+  than one variable in a given `Lambda` instance, be sure to use
+  a different name for each variable. Note that calling sublayers
+  from within a `Lambda` is not supported.
+
+  Example of variable creation:
+
+  ```python
+  def linear_transform(x):
+    v1 = tf.Variable(1., name='multiplier')
+    v2 = tf.Variable(0., name='bias')
+    return x*v1 + v2
+
+  linear_layer = Lambda(linear_transform)
+  model.add(linear_layer)
+  model.add(keras.layers.Dense(10, activation='relu'))
+  model.add(linear_layer)  # Reuses existing Variables
+  ```
+
+  Note that creating two instances of `Lambda` using the same function
+  will *not* share Variables between the two instances. Each instance of
+  `Lambda` will create and manage its own weights.
+
   Arguments:
-    function: The function to be evaluated.
-      Takes input tensor as first argument.
-    output_shape: Expected output shape from function.
-      This argument can be inferred if not explicitly provided.
-      Can be a tuple or function.
-      If a tuple, it only specifies the first dimension onward;
-      sample dimension is assumed either the same as the input:
-      `output_shape = (input_shape[0], ) + output_shape`
-      or, the input is `None` and
-      the sample dimension is also `None`:
-      `output_shape = (None, ) + output_shape`
-      If a function, it specifies the entire shape as a function of the
+    function: The function to be evaluated. Takes input tensor as first
+      argument.
+    output_shape: Expected output shape from function. This argument can be
+      inferred if not explicitly provided. Can be a tuple or function. If a
+      tuple, it only specifies the first dimension onward;
+      sample dimension is assumed either the same as the input: `output_shape =
+        (input_shape[0], ) + output_shape` or, the input is `None` and
+      the sample dimension is also `None`: `output_shape = (None, ) +
+        output_shape` If a function, it specifies the entire shape as a function
+        of the
       input shape: `output_shape = f(input_shape)`
-    arguments: Optional dictionary of keyword arguments to be passed
-      to the function.
-
-  Input shape:
-    Arbitrary. Use the keyword argument input_shape
-    (tuple of integers, does not include the samples axis)
-    when using this layer as the first layer in a model.
-
-  Output shape:
-    Specified by `output_shape` argument
+    arguments: Optional dictionary of keyword arguments to be passed to the
+      function.
+  Input shape: Arbitrary. Use the keyword argument input_shape (tuple of
+    integers, does not include the samples axis) when using this layer as the
+    first layer in a model.
+  Output shape: Specified by `output_shape` argument
   """
 
   def __init__(self, function, output_shape=None, mask=None, arguments=None,
@@ -702,6 +733,10 @@
       self.supports_masking = True
     self.mask = mask
     self._output_shape = output_shape
+    self._variable_dict = {}
+    # These attributes are inherited from `Layer`.
+    self._trainable_weights = []
+    self._non_trainable_weights = []
 
   @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
@@ -736,7 +771,21 @@
     arguments = self.arguments
     if generic_utils.has_arg(self.function, 'mask'):
       arguments['mask'] = mask
-    return self.function(inputs, **arguments)
+    with variable_scope.variable_creator_scope(self._variable_creator):
+      return self.function(inputs, **arguments)
+
+  def _variable_creator(self, next_creator, **kwargs):
+    name = kwargs['name']
+    if name in self._variable_dict:
+      return self._variable_dict[name]
+    var = next_creator(**kwargs)
+    self._variable_dict[name] = var
+    if var.trainable:
+      self._trainable_weights.append(var)
+    else:
+      self._non_trainable_weights.append(var)
+    K.track_variable(var)
+    return var
 
   def compute_mask(self, inputs, mask=None):
     if callable(self.mask):
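
The variable tracking added above relies on `variable_creator_scope`; a standalone sketch of the same caching pattern outside Keras (the names here are illustrative, not part of the patch):

```python
from tensorflow.python.ops import variable_scope, variables

_cache = {}

def caching_creator(next_creator, **kwargs):
  """Returns an existing variable when one with the same name was already made."""
  name = kwargs['name']
  if name in _cache:
    return _cache[name]
  var = next_creator(**kwargs)  # delegate actual creation to the default creator
  _cache[name] = var
  return var

with variable_scope.variable_creator_scope(caching_creator):
  v1 = variables.Variable(1.0, name='multiplier')
  v2 = variables.Variable(0.0, name='multiplier')  # cache hit: returns v1

assert v1 is v2
```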
diff --git a/tensorflow/python/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py
index afa00d9..92ddaa9 100644
--- a/tensorflow/python/keras/layers/core_test.py
+++ b/tensorflow/python/keras/layers/core_test.py
@@ -27,6 +27,7 @@
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.mixed_precision.experimental import policy
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -192,6 +193,40 @@
     })
     layer = keras.layers.Lambda.from_config(config)
 
+  def test_lambda_with_variable(self):
+
+    def fn(x):
+      return x * variables.Variable(2., name='multiplier')
+
+    layer = keras.layers.Lambda(fn)
+    for _ in range(10):
+      layer(np.ones((10, 10), 'float32'))
+    self.assertLen(layer.trainable_weights, 1)
+    self.assertEqual(layer.trainable_weights[0].name, 'lambda/multiplier:0')
+
+
+class TestStatefulLambda(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_all_keras_modes
+  @keras_parameterized.run_with_all_model_types
+  def test_lambda_with_variable_in_model(self):
+
+    def lambda_fn(x):
+      # Variable will only get created once.
+      v = variables.Variable(1., trainable=True)
+      return x * v
+
+    model = testing_utils.get_model_from_layers(
+        [keras.layers.Lambda(lambda_fn)], input_shape=(10,))
+    model.compile(
+        keras.optimizer_v2.gradient_descent.SGD(0.1),
+        'mae',
+        run_eagerly=testing_utils.should_run_eagerly())
+    x, y = np.ones((10, 10), 'float32'), 2 * np.ones((10, 10), 'float32')
+    model.fit(x, y, batch_size=2, epochs=2, validation_data=(x, y))
+    self.assertLen(model.trainable_weights, 1)
+    self.assertAllClose(keras.backend.get_value(model.trainable_weights[0]), 2.)
+
 
 @keras_parameterized.run_all_keras_modes
 class CoreLayersTest(keras_parameterized.TestCase):
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent.py b/tensorflow/python/keras/layers/cudnn_recurrent.py
index a74308f..193447c 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent.py
@@ -26,7 +26,7 @@
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
 from tensorflow.python.keras.engine.input_spec import InputSpec
-from tensorflow.python.keras.layers import recurrent
+from tensorflow.python.keras.layers import recurrent_v2
 from tensorflow.python.keras.layers.recurrent import RNN
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_cudnn_rnn_ops
@@ -275,7 +275,7 @@
     input_h = initial_state[0]
     input_h = array_ops.expand_dims(input_h, axis=0)
 
-    params = recurrent._canonical_to_params(    # pylint: disable=protected-access
+    params = recurrent_v2._canonical_to_params(    # pylint: disable=protected-access
         weights=[
             self.kernel[:, self.units:self.units * 2],
             self.kernel[:, :self.units],
@@ -470,7 +470,7 @@
     input_h = array_ops.expand_dims(input_h, axis=0)
     input_c = array_ops.expand_dims(input_c, axis=0)
 
-    params = recurrent._canonical_to_params(    # pylint: disable=protected-access
+    params = recurrent_v2._canonical_to_params(    # pylint: disable=protected-access
         weights=[
             self.kernel[:, :self.units],
             self.kernel[:, self.units:self.units * 2],
diff --git a/tensorflow/python/keras/layers/dense_attention.py b/tensorflow/python/keras/layers/dense_attention.py
index 1adf9d8..ffa265a 100644
--- a/tensorflow/python/keras/layers/dense_attention.py
+++ b/tensorflow/python/keras/layers/dense_attention.py
@@ -36,9 +36,40 @@
 
   Implementations of attention mechanisms should inherit from this class, and
   reuse the `apply_attention_scores()` method.
+
+  Call Arguments:
+
+    inputs: List of the following tensors:
+      * query: Query `Tensor` of shape `[batch_size, Tq, dim]`.
+      * value: Value `Tensor` of shape `[batch_size, Tv, dim]`.
+      * key: Optional key `Tensor` of shape `[batch_size, Tv, dim]`. If not
+        given, will use `value` for both `key` and `value`, which is the
+        most common case.
+    mask: List of the following tensors:
+      * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`.
+        If given, the output will be zero at the positions where
+        `mask==False`.
+      * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
+        If given, will apply the mask such that values at positions where
+        `mask==False` do not contribute to the result.
+
+  Output shape:
+
+    Attention outputs of shape `[batch_size, Tq, dim]`.
   """
 
-  def apply_attention_scores(self, scores, value, value_mask=None):
+  def _calculate_scores(self, query, key):
+    """Calculates attention scores.
+
+    Args:
+      query: Query tensor of shape `[batch_size, Tq, dim]`.
+      key: Key tensor of shape `[batch_size, Tv, dim]`.
+    Returns:
+      Tensor of shape `[batch_size, Tq, Tv]`.
+    """
+    raise NotImplementedError
+
+  def _apply_scores(self, scores, value, value_mask=None):
     """Applies attention scores to the given value tensor.
 
     To use this method in your attention layer, follow the steps:
@@ -68,3 +99,156 @@
       scores -= 1.e9 * math_ops.cast(padding_mask, dtype=K.floatx())
     attention_distribution = nn.softmax(scores)
     return math_ops.matmul(attention_distribution, value)
+
+  # TODO(b/125916026): Consider exposing a __call__ method with named args.
+  def call(self, inputs, mask=None):
+    self._validate_call_args(inputs=inputs, mask=mask)
+    q = inputs[0]
+    v = inputs[1]
+    k = inputs[2] if len(inputs) > 2 else v
+    q_mask = mask[0] if mask else None
+    v_mask = mask[1] if mask else None
+    # TODO(b/125916026): Support query_mask.
+    if q_mask is not None:
+      raise NotImplementedError('query_mask is not supported yet.')
+    scores = self._calculate_scores(query=q, key=k)
+    return self._apply_scores(scores=scores, value=v, value_mask=v_mask)
+
+  def _validate_call_args(self, inputs, mask):
+    """Validates arguments of the call method."""
+    class_name = self.__class__.__name__
+    if not isinstance(inputs, list):
+      raise ValueError(
+          '{} layer must be called on a list of inputs, namely [query, value] '
+          'or [query, value, key].'.format(class_name))
+    if len(inputs) < 2 or len(inputs) > 3:
+      raise ValueError(
+          '{} layer accepts inputs list of length 2 or 3, '
+          'namely [query, value] or [query, value, key]. '
+          'Given length: {}'.format(class_name, len(inputs)))
+    if mask:
+      if not isinstance(mask, list):
+        raise ValueError(
+            '{} layer mask must be a list, '
+            'namely [query_mask, value_mask].'.format(class_name))
+      if len(mask) != 2:
+        raise ValueError(
+            '{} layer mask must be a list of length 2, namely [query_mask, '
+            'value_mask]. Given length: {}'.format(class_name, len(mask)))
+
+
+class Attention(BaseDenseAttention):
+  """Dot-product attention layer, a.k.a. Luong-style attention.
+
+  Inputs are `query` tensor of shape `[batch_size, Tq, dim]`, `value` tensor
+  of shape `[batch_size, Tv, dim]` and `key` tensor of shape
+  `[batch_size, Tv, dim]`.
+  The calculation follows the steps:
+
+  1. Calculate scores with shape `[batch_size, Tq, Tv]` as a `query`-`key` dot
+     product: `scores = tf.matmul(query, key, transpose_b=True)`.
+  2. Use scores to calculate a distribution with shape
+     `[batch_size, Tq, Tv]`: `distribution = tf.nn.softmax(scores)`.
+  3. Use `distribution` to create a linear combination of `value` with
+     shape `[batch_size, Tq, dim]`:
+     `return tf.matmul(distribution, value)`.
+
+  Args:
+    scale: If `True`, will create a scalar variable to scale the attention
+      scores.
+
+  Call Arguments:
+
+    inputs: List of the following tensors:
+      * query: Query `Tensor` of shape `[batch_size, Tq, dim]`.
+      * value: Value `Tensor` of shape `[batch_size, Tv, dim]`.
+      * key: Optional key `Tensor` of shape `[batch_size, Tv, dim]`. If not
+        given, will use `value` for both `key` and `value`, which is the
+        most common case.
+    mask: List of the following tensors:
+      * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`.
+        If given, the output will be zero at the positions where
+        `mask==False`.
+      * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
+        If given, will apply the mask such that values at positions where
+        `mask==False` do not contribute to the result.
+
+  Output shape:
+
+    Attention outputs of shape `[batch_size, Tq, dim]`.
+
+  The meaning of `query`, `value` and `key` depends on the application. In the
+  case of text similarity, for example, `query` is the sequence embeddings of
+  the first piece of text and `value` is the sequence embeddings of the second
+  piece of text. `key` is usually the same tensor as `value`.
+
+  Here is a code example for using `Attention` in a CNN+Attention network:
+
+  ```python
+  # Variable-length int sequences.
+  query_input = tf.keras.Input(shape=(None,), dtype='int32')
+  value_input = tf.keras.Input(shape=(None,), dtype='int32')
+
+  # Embedding lookup.
+  token_embedding = tf.keras.layers.Embedding(max_tokens, dimension)
+  # Query embeddings of shape [batch_size, Tq, dimension].
+  query_embeddings = token_embedding(query_input)
+  # Value embeddings of shape [batch_size, Tv, dimension].
+  value_embeddings = token_embedding(value_input)
+
+  # CNN layer.
+  cnn_layer = tf.keras.layers.Conv1D(
+      filters=100,
+      kernel_size=4,
+      # Use 'same' padding so outputs have the same shape as inputs.
+      padding='same')
+  # Query encoding of shape [batch_size, Tq, filters].
+  query_seq_encoding = cnn_layer(query_embeddings)
+  # Value encoding of shape [batch_size, Tv, filters].
+  value_seq_encoding = cnn_layer(value_embeddings)
+
+  # Query-value attention of shape [batch_size, Tq, filters].
+  query_value_attention_seq = tf.keras.layers.Attention()(
+      [query_seq_encoding, value_seq_encoding])
+
+  # Reduce over the sequence axis to produce encodings of shape
+  # [batch_size, filters].
+  query_encoding = tf.keras.layers.GlobalAveragePooling1D()(
+      query_seq_encoding)
+  query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(
+      query_value_attention_seq)
+
+  # Concatenate query and document encodings to produce a DNN input layer.
+  input_layer = tf.keras.layers.Concatenate()(
+      [query_encoding, query_value_attention])
+
+  # Add DNN layers, and create Model.
+  # ...
+  ```
+  """
+
+  def __init__(self, scale=False, **kwargs):
+    super(Attention, self).__init__(**kwargs)
+    # TODO(b/125916026): Support scale.
+    if scale:
+      raise NotImplementedError('scale=True is not supported yet.')
+    self.scale = scale
+
+  def build(self, input_shape):
+    """Creates scale variable if scale==True."""
+    # TODO(b/125916026): Create scale variable if self.scale is True.
+    self.scale_var = None
+    super(Attention, self).build(input_shape)
+
+  def _calculate_scores(self, query, key):
+    """Calculates attention scores as a query-key dot product.
+
+    Args:
+      query: Query tensor of shape `[batch_size, Tq, dim]`.
+      key: Key tensor of shape `[batch_size, Tv, dim]`.
+    Returns:
+      Tensor of shape `[batch_size, Tq, Tv]`.
+    """
+    scores = math_ops.matmul(query, key, transpose_b=True)
+    if self.scale_var is not None:
+      scores *= self.scale_var
+    return scores
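
The three documented steps reduce to two matmuls and a softmax; a NumPy sketch of the unmasked case, illustrative only and not the layer's actual code path:

```python
import numpy as np

def luong_attention(query, value, key=None):
  """scores = q.k^T, distribution = softmax(scores), output = distribution.v."""
  key = value if key is None else key
  scores = np.matmul(query, np.swapaxes(key, -1, -2))        # [batch, Tq, Tv]
  scores = scores - scores.max(axis=-1, keepdims=True)       # numerical stability
  distribution = np.exp(scores)
  distribution /= distribution.sum(axis=-1, keepdims=True)   # softmax over Tv
  return np.matmul(distribution, value)                      # [batch, Tq, dim]

q = np.array([[[1.1]]], dtype=np.float32)                    # [1, 1, 1]
v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32)     # [1, 3, 1]
print(luong_attention(q, v))                                 # ~[[[1.25]]], no mask
```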
diff --git a/tensorflow/python/keras/layers/dense_attention_test.py b/tensorflow/python/keras/layers/dense_attention_test.py
index 0d7ebf4..adae877 100644
--- a/tensorflow/python/keras/layers/dense_attention_test.py
+++ b/tensorflow/python/keras/layers/dense_attention_test.py
@@ -22,6 +22,7 @@
 
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.layers import dense_attention
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -35,7 +36,7 @@
     v = np.array([[[1.6]]], dtype=np.float32)
     # Value mask tensor of shape [1, 1]
     v_mask = np.array([[True]], dtype=np.bool_)
-    actual = dense_attention.BaseDenseAttention().apply_attention_scores(
+    actual = dense_attention.BaseDenseAttention()._apply_scores(
         scores=scores, value=v, value_mask=v_mask)
 
     # Expected tensor of shape [1, 1, 1].
@@ -48,7 +49,7 @@
     scores = np.array([[[1.1]]], dtype=np.float32)
     # Value tensor of shape [1, 1, 1]
     v = np.array([[[1.6]]], dtype=np.float32)
-    actual = dense_attention.BaseDenseAttention().apply_attention_scores(
+    actual = dense_attention.BaseDenseAttention()._apply_scores(
         scores=scores, value=v)
 
     # Expected tensor of shape [1, 1, 1].
@@ -63,7 +64,7 @@
     v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32)
     # Value mask tensor of shape [1, 3]
     v_mask = np.array([[True, True, False]], dtype=np.bool_)
-    actual = dense_attention.BaseDenseAttention().apply_attention_scores(
+    actual = dense_attention.BaseDenseAttention()._apply_scores(
         scores=scores, value=v, value_mask=v_mask)
 
     # Expected attention distribution = softmax(scores) with zeros in
@@ -83,7 +84,7 @@
     scores = np.array([[[1., 0., 1.]]], dtype=np.float32)
     # Value tensor of shape [1, 3, 1]
     v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32)
-    actual = dense_attention.BaseDenseAttention().apply_attention_scores(
+    actual = dense_attention.BaseDenseAttention()._apply_scores(
         scores=scores, value=v)
 
     # Expected attention distribution = softmax(scores).
@@ -108,7 +109,7 @@
     v = np.array([[[1.6]], [[2.6]]], dtype=np.float32)
     # Value mask tensor of shape [2, 1]
     v_mask = np.array([[True], [True]], dtype=np.bool_)
-    actual = dense_attention.BaseDenseAttention().apply_attention_scores(
+    actual = dense_attention.BaseDenseAttention()._apply_scores(
         scores=scores, value=v, value_mask=v_mask)
 
     # Expected tensor of shape [2, 1, 1].
@@ -118,5 +119,211 @@
     self.assertAllClose(expected, actual)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class AttentionTest(test.TestCase):
+
+  def test_calculate_scores_one_dim(self):
+    # Query tensor of shape [1, 1, 1]
+    q = np.array([[[1.1]]], dtype=np.float32)
+    # Key tensor of shape [1, 1, 1]
+    k = np.array([[[1.6]]], dtype=np.float32)
+    attention_layer = dense_attention.Attention()
+    attention_layer.build(input_shape=([1, 1, 1], [1, 1, 1]))
+    actual = attention_layer._calculate_scores(query=q, key=k)
+
+    # Expected tensor of shape [1, 1, 1].
+    # expected000 = 1.1*1.6 = 1.76
+    expected = np.array([[[1.76]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_calculate_scores_multi_dim(self):
+    # Query tensor of shape [1, 2, 4]
+    q = np.array(
+        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    # Key tensor of shape [1, 3, 4]
+    k = np.array(
+        [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
+        dtype=np.float32)
+    attention_layer = dense_attention.Attention()
+    attention_layer.build(input_shape=([1, 2, 4], [1, 3, 4]))
+    actual = attention_layer._calculate_scores(query=q, key=k)
+
+    # Expected tensor of shape [1, 2, 3].
+    # expected000 = 1.*1.5+1.1*1.6+1.2*1.7+1.3*1.8 = 7.64
+    # expected001 = 1.*2.5+1.1*2.6+1.2*2.7+1.3*2.8 = 12.24
+    # expected002 = 1.*3.5+1.1*3.6+1.2*3.7+1.3*3.8 = 16.84
+    # expected010 = 2.*1.5+2.1*1.6+2.2*1.7+2.3*1.8 = 14.24
+    # expected011 = 2.*2.5+2.1*2.6+2.2*2.7+2.3*2.8 = 22.84
+    # expected012 = 2.*3.5+2.1*3.6+2.2*3.7+2.3*3.8 = 31.44
+    expected = np.array(
+        [[[7.64, 12.24, 16.84], [14.24, 22.84, 31.44]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_calculate_scores_one_dim_batch_size_two(self):
+    # Query tensor of shape [2, 1, 1]
+    q = np.array([[[1.1]], [[2.1]]], dtype=np.float32)
+    # Key tensor of shape [2, 1, 1]
+    k = np.array([[[1.6]], [[2.6]]], dtype=np.float32)
+    attention_layer = dense_attention.Attention()
+    attention_layer.build(input_shape=([2, 1, 1], [2, 1, 1]))
+    actual = attention_layer._calculate_scores(query=q, key=k)
+
+    # Expected tensor of shape [2, 1, 1].
+    # expected000 = 1.1*1.6 = 1.76
+    # expected100 = 2.1*2.6 = 5.46
+    expected = np.array([[[1.76]], [[5.46]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_shape(self):
+    # Query tensor of shape [1, 2, 4]
+    q = np.array(
+        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    # Value tensor of shape [1, 3, 4]
+    v = np.array(
+        [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
+        dtype=np.float32)
+    # Value mask tensor of shape [1, 3]
+    v_mask = np.array([[True, True, False]], dtype=np.bool_)
+    attention_layer = dense_attention.Attention()
+    actual = attention_layer([q, v], mask=[None, v_mask])
+
+    expected_shape = [1, 2, 4]
+    self.assertAllEqual(expected_shape, array_ops.shape(actual))
+
+  def test_shape_with_key(self):
+    # Query tensor of shape [1, 2, 4]
+    q = np.array(
+        [[[1., 1.1, 1.2, 1.3], [2., 2.1, 2.2, 2.3]]], dtype=np.float32)
+    # Value tensor of shape [1, 3, 4]
+    v = np.array(
+        [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
+        dtype=np.float32)
+    # Key tensor of shape [1, 3, 4]
+    k = np.array(
+        [[[1.5, 1.6, 1.7, 1.8], [2.5, 2.6, 2.7, 2.8], [3.5, 3.6, 3.7, 3.8]]],
+        dtype=np.float32)
+    # Value mask tensor of shape [1, 3]
+    v_mask = np.array([[True, True, False]], dtype=np.bool_)
+    attention_layer = dense_attention.Attention()
+    actual = attention_layer([q, v, k], mask=[None, v_mask])
+
+    expected_shape = [1, 2, 4]
+    self.assertAllEqual(expected_shape, array_ops.shape(actual))
+
+  def test_multi_dim(self):
+    # Query tensor of shape [1, 1, 1]
+    q = np.array([[[1.1]]], dtype=np.float32)
+    # Value tensor of shape [1, 3, 1]
+    v = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32)
+    # Value mask tensor of shape [1, 3]
+    v_mask = np.array([[True, True, False]], dtype=np.bool_)
+    attention_layer = dense_attention.Attention()
+    actual = attention_layer([q, v], mask=[None, v_mask])
+
+    # Expected scores of shape [1, 1, 3]
+    # scores = [[[1.1*1.6, 1.1*0.7, -1.1*0.8]]] = [[[1.76, 0.77, -0.88]]]
+    # Expected attention distribution = softmax(scores) with zeros in
+    # positions where v_mask == False.
+    # => attention_distribution000 = exp(1.76)/(exp(1.76) + exp(0.77))
+    #                              = 0.72908792234
+    #    attention_distribution001 = exp(0.77)/(exp(1.76) + exp(0.77))
+    #                              = 0.27091207765
+    #    attention_distribution002 = 0
+    #
+    # Expected tensor of shape [1, 1, 1].
+    # expected000 = 0.72908792234 * 1.6 + 0.27091207765 * 0.7 - 0 * 0.8
+    #             = 1.3561791301
+    expected = np.array([[[1.3561791301]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_multi_dim_with_key(self):
+    # Query tensor of shape [1, 1, 1]
+    q = np.array([[[1.1]]], dtype=np.float32)
+    # Value tensor of shape [1, 3, 1]
+    v = np.array([[[0.5], [0.8], [-0.3]]], dtype=np.float32)
+    # Key tensor of shape [1, 3, 1]
+    k = np.array([[[1.6], [0.7], [-0.8]]], dtype=np.float32)
+    # Value mask tensor of shape [1, 3]
+    v_mask = np.array([[True, True, False]], dtype=np.bool_)
+    attention_layer = dense_attention.Attention()
+    actual = attention_layer([q, v, k], mask=[None, v_mask])
+
+    # Expected scores of shape [1, 1, 3]
+    # scores = [[[1.1*1.6, 1.1*0.7, -1.1*0.8]]] = [[[1.76, 0.77, -0.88]]]
+    # Expected attention distribution = softmax(scores) with zeros in
+    # positions where v_mask == False.
+    # => attention_distribution000 = exp(1.76)/(exp(1.76) + exp(0.77))
+    #                              = 0.72908792234
+    #    attention_distribution001 = exp(0.77)/(exp(1.76) + exp(0.77))
+    #                              = 0.27091207765
+    #    attention_distribution002 = 0
+    #
+    # Expected tensor of shape [1, 1, 1].
+    # expected000 = 0.72908792234 * 0.5 + 0.27091207765 * 0.8 - 0 * 0.3
+    #             = 0.58127362329
+    expected = np.array([[[0.58127362329]]], dtype=np.float32)
+    self.assertAllClose(expected, actual)
+
+  def test_scale_not_implemented(self):
+    with self.assertRaisesRegexp(
+        NotImplementedError, 'scale=True is not supported yet'):
+      dense_attention.Attention(scale=True)
+
+  def test_query_mask_not_implemented(self):
+    attention_layer = dense_attention.Attention()
+    q = np.array([[[1.1]]], dtype=np.float32)
+    mask = np.array([[True]], dtype=np.bool_)
+    with self.assertRaisesRegexp(
+        NotImplementedError, 'query_mask is not supported yet'):
+      attention_layer([q, q], mask=[mask, mask])
+
+  def test_inputs_not_list(self):
+    attention_layer = dense_attention.Attention()
+    q = np.array([[[1.1]]], dtype=np.float32)
+    with self.assertRaisesRegexp(
+        ValueError, 'Attention layer must be called on a list of inputs'):
+      attention_layer(q)
+
+  def test_inputs_too_short(self):
+    attention_layer = dense_attention.Attention()
+    q = np.array([[[1.1]]], dtype=np.float32)
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Attention layer accepts inputs list of length 2 or 3'):
+      attention_layer([q])
+
+  def test_inputs_too_long(self):
+    attention_layer = dense_attention.Attention()
+    q = np.array([[[1.1]]], dtype=np.float32)
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Attention layer accepts inputs list of length 2 or 3'):
+      attention_layer([q, q, q, q])
+
+  def test_mask_not_list(self):
+    attention_layer = dense_attention.Attention()
+    q = np.array([[[1.1]]], dtype=np.float32)
+    mask = np.array([[True]], dtype=np.bool_)
+    with self.assertRaisesRegexp(
+        ValueError, 'Attention layer mask must be a list'):
+      attention_layer([q, q], mask=mask)
+
+  def test_mask_too_short(self):
+    attention_layer = dense_attention.Attention()
+    q = np.array([[[1.1]]], dtype=np.float32)
+    mask = np.array([[True]], dtype=np.bool_)
+    with self.assertRaisesRegexp(
+        ValueError, 'Attention layer mask must be a list of length 2'):
+      attention_layer([q, q], mask=[mask])
+
+  def test_mask_too_long(self):
+    attention_layer = dense_attention.Attention()
+    q = np.array([[[1.1]]], dtype=np.float32)
+    mask = np.array([[True]], dtype=np.bool_)
+    with self.assertRaisesRegexp(
+        ValueError, 'Attention layer mask must be a list of length 2'):
+      attention_layer([q, q], mask=[mask, mask, mask])
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/unified_gru_test.py b/tensorflow/python/keras/layers/gru_v2_test.py
similarity index 88%
rename from tensorflow/python/keras/layers/unified_gru_test.py
rename to tensorflow/python/keras/layers/gru_v2_test.py
index 3015b8b..8f241f4 100644
--- a/tensorflow/python/keras/layers/unified_gru_test.py
+++ b/tensorflow/python/keras/layers/gru_v2_test.py
@@ -34,6 +34,8 @@
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.layers import recurrent as rnn_v1
+from tensorflow.python.keras.layers import recurrent_v2 as rnn
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
@@ -54,7 +56,7 @@
 
 
 @keras_parameterized.run_all_keras_modes(config=_config)
-class UnifiedGRUTest(keras_parameterized.TestCase):
+class GRUV2Test(keras_parameterized.TestCase):
 
   @parameterized.named_parameters(
       ('non_tan_activation', 'relu', 'sigmoid', 0, False, True, True),
@@ -67,13 +69,13 @@
   def test_could_use_defun_backend(self, activation, recurrent_activation,
                                    recurrent_dropout, unroll, use_bias,
                                    reset_after):
-    layer = keras.layers.UnifiedGRU(1,
-                                    activation=activation,
-                                    recurrent_activation=recurrent_activation,
-                                    recurrent_dropout=recurrent_dropout,
-                                    unroll=unroll,
-                                    use_bias=use_bias,
-                                    reset_after=reset_after)
+    layer = rnn.GRU(1,
+                    activation=activation,
+                    recurrent_activation=recurrent_activation,
+                    recurrent_dropout=recurrent_dropout,
+                    unroll=unroll,
+                    use_bias=use_bias,
+                    reset_after=reset_after)
     self.assertFalse(layer.could_use_cudnn)
 
   def test_keras_model_with_gru(self):
@@ -91,7 +93,7 @@
         num_classes=output_shape)
     y_train = keras.utils.to_categorical(y_train, output_shape)
 
-    layer = keras.layers.UnifiedGRU(rnn_state_size)
+    layer = rnn.GRU(rnn_state_size)
 
     inputs = keras.layers.Input(
         shape=[timestep, input_shape], dtype=dtypes.float32)
@@ -108,7 +110,7 @@
     timesteps = 3
     embedding_dim = 4
     units = 2
-    layer = keras.layers.UnifiedGRU(units, input_shape=(None, embedding_dim))
+    layer = rnn.GRU(units, input_shape=(None, embedding_dim))
     model = keras.models.Sequential()
     model.add(layer)
     model.compile(gradient_descent.GradientDescentOptimizer(0.001), 'mse')
@@ -121,15 +123,15 @@
     targets = np.abs(np.random.random((2, 3, 5)))
     targets /= targets.sum(axis=-1, keepdims=True)
     model = keras.models.Sequential()
-    model.add(keras.layers.UnifiedGRU(10, return_sequences=True, unroll=False))
-    model.add(keras.layers.UnifiedGRU(5, return_sequences=True, unroll=False))
+    model.add(rnn.GRU(10, return_sequences=True, unroll=False))
+    model.add(rnn.GRU(5, return_sequences=True, unroll=False))
     model.compile(
         loss='categorical_crossentropy',
         optimizer=gradient_descent.GradientDescentOptimizer(0.01))
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
   def test_from_config_GRU(self):
-    layer_class = keras.layers.UnifiedGRU
+    layer_class = rnn.GRU
     for stateful in (False, True):
       l1 = layer_class(units=1, stateful=stateful)
       l2 = layer_class.from_config(l1.get_config())
@@ -152,9 +154,9 @@
 
       inputs = keras.layers.Input(
           shape=[timestep, input_shape], dtype=dtypes.float32)
-      gru_layer = keras.layers.GRU(rnn_state_size,
-                                   recurrent_activation='sigmoid',
-                                   reset_after=True)
+      gru_layer = rnn_v1.GRU(rnn_state_size,
+                             recurrent_activation='sigmoid',
+                             reset_after=True)
       output = gru_layer(inputs)
       gru_model = keras.models.Model(inputs, output)
       weights = gru_model.get_weights()
@@ -164,9 +166,9 @@
       y_2 = gru_model.predict(x_train)
 
       with test_util.device(use_gpu=True):
-        cudnn_layer = keras.layers.UnifiedGRU(rnn_state_size,
-                                              recurrent_activation='sigmoid',
-                                              reset_after=True)
+        cudnn_layer = rnn.GRU(rnn_state_size,
+                              recurrent_activation='sigmoid',
+                              reset_after=True)
         cudnn_model = keras.models.Model(inputs, cudnn_layer(inputs))
       cudnn_model.set_weights(weights)
       y_3 = cudnn_model.predict(x_train)
@@ -198,7 +200,7 @@
     def build_model():
       inputs = keras.layers.Input(
           shape=[timestep, input_dim], dtype=dtypes.float32)
-      layer = keras.layers.UnifiedGRU(
+      layer = rnn.GRU(
           units,
           use_bias=use_bias,
           bias_initializer=bias_initializer)
@@ -227,14 +229,14 @@
     inputs = keras.layers.Input(
         shape=[timestep, input_shape], dtype=dtypes.float32)
     with test_util.device(use_gpu=False):
-      layer = keras.layers.UnifiedGRU(rnn_state_size)
+      layer = rnn.GRU(rnn_state_size)
       output = layer(inputs)
       cpu_model = keras.models.Model(inputs, output)
       weights = cpu_model.get_weights()
       y_1 = cpu_model.predict(x_train)
 
     with test_util.device(use_gpu=True):
-      layer = keras.layers.UnifiedGRU(rnn_state_size)
+      layer = rnn.GRU(rnn_state_size)
       output = layer(inputs)
       gpu_model = keras.models.Model(inputs, output)
       gpu_model.set_weights(weights)
@@ -244,9 +246,9 @@
     # 'sigmoid' as default. Construct the canonical GRU with sigmoid to achieve
     # the same output.
     with test_util.device(use_gpu=True):
-      layer = keras.layers.GRU(rnn_state_size,
-                               recurrent_activation='sigmoid',
-                               reset_after=True)
+      layer = rnn_v1.GRU(rnn_state_size,
+                         recurrent_activation='sigmoid',
+                         reset_after=True)
       output = layer(inputs)
       canonical_model = keras.models.Model(inputs, output)
       canonical_model.set_weights(weights)
@@ -289,18 +291,18 @@
         outputs = layer(inputs)
       return keras.models.Model(inputs, outputs)
 
-    gru_model = build_model(keras.layers.GRU)
+    gru_model = build_model(rnn_v1.GRU)
     y_ref = gru_model.predict(x_train)
     weights = gru_model.get_weights()
 
-    unified_gru_model = build_model(keras.layers.UnifiedGRU)
+    unified_gru_model = build_model(rnn.GRU)
     unified_gru_model.set_weights(weights)
     y = unified_gru_model.predict(x_train)
 
     self.assertAllClose(y, y_ref)
 
   def test_with_masking_layer_GRU(self):
-    layer_class = keras.layers.UnifiedGRU
+    layer_class = rnn.GRU
     inputs = np.random.random((2, 3, 4))
     targets = np.abs(np.random.random((2, 3, 5)))
     targets /= targets.sum(axis=-1, keepdims=True)
@@ -317,8 +319,8 @@
     targets /= targets.sum(axis=-1, keepdims=True)
     model = keras.models.Sequential()
     model.add(keras.layers.Masking(input_shape=(3, 4)))
-    model.add(keras.layers.UnifiedGRU(10, return_sequences=True, unroll=False))
-    model.add(keras.layers.UnifiedGRU(5, return_sequences=True, unroll=False))
+    model.add(rnn.GRU(10, return_sequences=True, unroll=False))
+    model.add(rnn.GRU(5, return_sequences=True, unroll=False))
     model.compile(
         loss='categorical_crossentropy',
         optimizer=gradient_descent.GradientDescentOptimizer(0.01))
@@ -330,13 +332,13 @@
     embedding_dim = 4
     units = 2
     testing_utils.layer_test(
-        keras.layers.UnifiedGRU,
+        rnn.GRU,
         kwargs={'units': units,
                 'return_sequences': True},
         input_shape=(num_samples, timesteps, embedding_dim))
 
   def test_return_states_GRU(self):
-    layer_class = keras.layers.UnifiedGRU
+    layer_class = rnn.GRU
     x = np.random.random((2, 3, 4))
     y = np.abs(np.random.random((2, 5)))
     s = np.abs(np.random.random((2, 5)))
@@ -356,7 +358,7 @@
     embedding_dim = 4
     units = 2
     testing_utils.layer_test(
-        keras.layers.UnifiedGRU,
+        rnn.GRU,
         kwargs={'units': units,
                 'dropout': 0.1,
                 'recurrent_dropout': 0.1},
@@ -364,7 +366,7 @@
 
   def test_constraints_GRU(self):
     embedding_dim = 4
-    layer_class = keras.layers.UnifiedGRU
+    layer_class = rnn.GRU
     k_constraint = keras.constraints.max_norm(0.01)
     r_constraint = keras.constraints.max_norm(0.01)
     b_constraint = keras.constraints.max_norm(0.01)
@@ -388,14 +390,14 @@
     embedding_dim = 4
     units = 2
     testing_utils.layer_test(
-        keras.layers.UnifiedGRU,
+        rnn.GRU,
         kwargs={'units': units,
                 'implementation': implementation_mode},
         input_shape=(num_samples, timesteps, embedding_dim))
 
   def test_regularizers_GRU(self):
     embedding_dim = 4
-    layer_class = keras.layers.UnifiedGRU
+    layer_class = rnn.GRU
     layer = layer_class(
         5,
         return_sequences=False,
@@ -420,7 +422,7 @@
     timesteps = 3
     embedding_dim = 4
     units = 2
-    layer_class = keras.layers.UnifiedGRU
+    layer_class = rnn.GRU
     model = keras.models.Sequential()
     model.add(
         keras.layers.Embedding(
@@ -490,9 +492,7 @@
     model = keras.Sequential([
         keras.layers.Embedding(vocab_size, embedding_dim,
                                batch_input_shape=[batch_size, timestep]),
-        keras.layers.UnifiedGRU(units,
-                                return_sequences=True,
-                                stateful=True),
+        rnn.GRU(units, return_sequences=True, stateful=True),
         keras.layers.Dense(vocab_size)
     ])
     model.compile(optimizer='adam',
@@ -511,11 +511,11 @@
     embedding_size = 11
     gru_unit_size = 12
 
-    gru = keras.layers.UnifiedGRU(gru_unit_size,
-                                  return_sequences=True,
-                                  return_state=True,
-                                  recurrent_activation='sigmoid',
-                                  recurrent_initializer='glorot_uniform')
+    gru = rnn.GRU(gru_unit_size,
+                  return_sequences=True,
+                  return_state=True,
+                  recurrent_activation='sigmoid',
+                  recurrent_initializer='glorot_uniform')
 
     x = random_ops.random_uniform([1, time_steps, embedding_size])
     y = random_ops.random_uniform([1, gru_unit_size])
@@ -549,7 +549,7 @@
           num_classes=output_shape)
       y_train = keras.utils.to_categorical(y_train, output_shape)
 
-      layer = keras.layers.UnifiedGRU(rnn_state_size, return_runtime=True)
+      layer = rnn.GRU(rnn_state_size, return_runtime=True)
 
       inputs = array_ops.placeholder(
           dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
@@ -598,7 +598,7 @@
           num_classes=output_shape)
       y_train = keras.utils.to_categorical(y_train, output_shape)
 
-      layer = keras.layers.UnifiedGRU(rnn_state_size, return_runtime=True)
+      layer = rnn.GRU(rnn_state_size, return_runtime=True)
 
       inputs = array_ops.placeholder(
           dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
diff --git a/tensorflow/python/keras/layers/unified_lstm_test.py b/tensorflow/python/keras/layers/lstm_v2_test.py
similarity index 92%
rename from tensorflow/python/keras/layers/unified_lstm_test.py
rename to tensorflow/python/keras/layers/lstm_v2_test.py
index 1e94b30..5bafb56 100644
--- a/tensorflow/python/keras/layers/unified_lstm_test.py
+++ b/tensorflow/python/keras/layers/lstm_v2_test.py
@@ -35,6 +35,8 @@
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.layers import recurrent as rnn_v1
+from tensorflow.python.keras.layers import recurrent_v2 as rnn
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
@@ -55,7 +57,7 @@
 
 
 @keras_parameterized.run_all_keras_modes(config=_config)
-class UnifiedLSTMTest(keras_parameterized.TestCase):
+class LSTMV2Test(keras_parameterized.TestCase):
 
   @parameterized.named_parameters(
       ('non_tan_activation', 'relu', 'sigmoid', 0, False, True),
@@ -66,7 +68,7 @@
   )
   def test_could_use_defun_backend(self, activation, recurrent_activation,
                                    recurrent_dropout, unroll, use_bias):
-    layer = keras.layers.UnifiedLSTM(
+    layer = rnn.LSTM(
         1,
         activation=activation,
         recurrent_activation=recurrent_activation,
@@ -85,7 +87,7 @@
     inputs = keras.layers.Dense(
         embedding_dim, input_shape=(timesteps, embedding_dim))
     model.add(inputs)
-    layer = keras.layers.UnifiedLSTM(units, return_sequences=True)
+    layer = rnn.LSTM(units, return_sequences=True)
     model.add(layer)
     outputs = model.layers[-1].output
     self.assertEqual(outputs.get_shape().as_list(), [None, timesteps, units])
@@ -95,7 +97,7 @@
     timesteps = 3
     embedding_dim = 4
     units = 2
-    layer = keras.layers.UnifiedLSTM(units, input_shape=(None, embedding_dim))
+    layer = rnn.LSTM(units, input_shape=(None, embedding_dim))
     model = keras.models.Sequential()
     model.add(layer)
     model.compile(gradient_descent.GradientDescentOptimizer(0.001), 'mse')
@@ -108,15 +110,15 @@
     targets = np.abs(np.random.random((2, 3, 5)))
     targets /= targets.sum(axis=-1, keepdims=True)
     model = keras.models.Sequential()
-    model.add(keras.layers.UnifiedLSTM(10, return_sequences=True, unroll=False))
-    model.add(keras.layers.UnifiedLSTM(5, return_sequences=True, unroll=False))
+    model.add(rnn.LSTM(10, return_sequences=True, unroll=False))
+    model.add(rnn.LSTM(5, return_sequences=True, unroll=False))
     model.compile(
         loss='categorical_crossentropy',
         optimizer=gradient_descent.GradientDescentOptimizer(0.01))
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
   def test_from_config_LSTM(self):
-    layer_class = keras.layers.UnifiedLSTM
+    layer_class = rnn.LSTM
     for stateful in (False, True):
       l1 = layer_class(units=1, stateful=stateful)
       l2 = layer_class.from_config(l1.get_config())
@@ -132,7 +134,7 @@
     # Test with Keras tensor
     inputs = keras.Input((timesteps, embedding_dim))
     initial_state = [keras.Input((units,)) for _ in range(num_states)]
-    layer = keras.layers.UnifiedLSTM(units)
+    layer = rnn.LSTM(units)
     if len(initial_state) == 1:
       output = layer(inputs, initial_state=initial_state[0])
     else:
@@ -164,7 +166,7 @@
         keras.backend.random_normal_variable((num_samples, units), 0, 1)
         for _ in range(num_states)
     ]
-    layer = keras.layers.UnifiedLSTM(units)
+    layer = rnn.LSTM(units)
     output = layer(inputs, initial_state=initial_state)
 
     model = keras.models.Model(inputs, output)
@@ -183,7 +185,7 @@
     units = 3
     num_samples = 2
 
-    layer = keras.layers.UnifiedLSTM(units, stateful=True)
+    layer = rnn.LSTM(units, stateful=True)
     layer.build((num_samples, timesteps, embedding_dim))
     initial_weight_count = len(layer.weights)
     layer.reset_states()
@@ -223,7 +225,7 @@
     inputs = keras.Input((timesteps, embedding_dim))
     _ = keras.layers.Masking()(inputs)
     initial_state = [keras.Input((units,)) for _ in range(num_states)]
-    output = keras.layers.UnifiedLSTM(units)(
+    output = rnn.LSTM(units)(
         inputs, initial_state=initial_state)
 
     model = keras.models.Model([inputs] + initial_state, output)
@@ -247,7 +249,7 @@
 
     inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
     masked = keras.layers.Masking()(inputs)
-    layer = keras.layers.UnifiedLSTM(units, return_state=True, stateful=True)
+    layer = rnn.LSTM(units, return_state=True, stateful=True)
     outputs = layer(masked)
     state = outputs[1:]
     assert len(state) == num_states
@@ -264,11 +266,11 @@
     num_samples = 2
 
     inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
-    layer = keras.layers.UnifiedLSTM(
+    layer = rnn.LSTM(
         units, return_state=True, return_sequences=True)
     outputs = layer(inputs)
     output, state = outputs[0], outputs[1:]
-    output = keras.layers.UnifiedLSTM(units)(output, initial_state=state)
+    output = rnn.LSTM(units)(output, initial_state=state)
     model = keras.models.Model(inputs, output)
 
     inputs = np.random.random((num_samples, timesteps, embedding_dim))
@@ -280,7 +282,7 @@
     units = 3
     num_samples = 2
     num_states = 2
-    layer_class = keras.layers.UnifiedLSTM
+    layer_class = rnn.LSTM
 
     # Test with Keras tensor
     main_inputs = keras.Input((timesteps, embedding_dim))
@@ -320,8 +322,8 @@
 
       inputs = keras.layers.Input(
           shape=[timestep, input_shape], dtype=dtypes.float32)
-      lstm_layer = keras.layers.LSTM(rnn_state_size,
-                                     recurrent_activation='sigmoid')
+      lstm_layer = rnn_v1.LSTM(rnn_state_size,
+                               recurrent_activation='sigmoid')
       output = lstm_layer(inputs)
       lstm_model = keras.models.Model(inputs, output)
       weights = lstm_model.get_weights()
@@ -331,7 +333,7 @@
       y_2 = lstm_model.predict(x_train)
 
       with test_util.device(use_gpu=True):
-        cudnn_layer = keras.layers.UnifiedLSTM(rnn_state_size)
+        cudnn_layer = rnn.LSTM(rnn_state_size)
         cudnn_model = keras.models.Model(inputs, cudnn_layer(inputs))
       cudnn_model.set_weights(weights)
       y_3 = cudnn_model.predict(x_train)
@@ -343,20 +345,20 @@
       self.assertAllClose(y_2, y_4, rtol=1e-5, atol=1e-5)
 
   @parameterized.named_parameters(('v0', 0), ('v1', 1), ('v2', 2))
-  def test_implementation_mode_LSTM(self, implementation_mode):
+  def DISABLED_test_implementation_mode_LSTM(self, implementation_mode):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
     testing_utils.layer_test(
-        keras.layers.UnifiedLSTM,
+        rnn.LSTM,
         kwargs={
             'units': units,
             'implementation': implementation_mode
         },
         input_shape=(num_samples, timesteps, embedding_dim))
 
-    layer_class = keras.layers.UnifiedLSTM
+    layer_class = rnn.LSTM
     k_constraint = keras.constraints.max_norm(0.01)
     r_constraint = keras.constraints.max_norm(0.01)
     b_constraint = keras.constraints.max_norm(0.01)
@@ -373,7 +375,7 @@
     self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
     self.assertEqual(layer.cell.bias.constraint, b_constraint)
 
-    layer_class = keras.layers.UnifiedLSTM
+    layer_class = rnn.LSTM
     inputs = np.random.random((2, 3, 4))
     targets = np.abs(np.random.random((2, 3, 5)))
     targets /= targets.sum(axis=-1, keepdims=True)
@@ -391,8 +393,8 @@
     targets /= targets.sum(axis=-1, keepdims=True)
     model = keras.models.Sequential()
     model.add(keras.layers.Masking(input_shape=(3, 4)))
-    model.add(keras.layers.UnifiedLSTM(10, return_sequences=True, unroll=False))
-    model.add(keras.layers.UnifiedLSTM(5, return_sequences=True, unroll=False))
+    model.add(rnn.LSTM(10, return_sequences=True, unroll=False))
+    model.add(rnn.LSTM(5, return_sequences=True, unroll=False))
     model.compile(
         loss='categorical_crossentropy',
         optimizer=gradient_descent.GradientDescentOptimizer(0.01))
@@ -431,11 +433,11 @@
         outputs = layer(inputs)
       return keras.models.Model(inputs, outputs)
 
-    lstm_model = build_model(keras.layers.LSTM)
+    lstm_model = build_model(rnn_v1.LSTM)
     y_ref = lstm_model.predict(x_train)
     weights = lstm_model.get_weights()
 
-    unified_lstm_model = build_model(keras.layers.UnifiedLSTM)
+    unified_lstm_model = build_model(rnn.LSTM)
     unified_lstm_model.set_weights(weights)
     y = unified_lstm_model.predict(x_train)
 
@@ -455,7 +457,7 @@
         num_classes=output_shape)
     y_train = keras.utils.to_categorical(y_train, output_shape)
 
-    layer = keras.layers.UnifiedLSTM(rnn_state_size)
+    layer = rnn.LSTM(rnn_state_size)
 
     inputs = keras.layers.Input(
         shape=[timestep, input_shape], dtype=dtypes.float32)
@@ -488,7 +490,7 @@
     def build_model():
       inputs = keras.layers.Input(
           shape=[timestep, input_dim], dtype=dtypes.float32)
-      layer = keras.layers.UnifiedLSTM(
+      layer = rnn.LSTM(
           units,
           use_bias=use_bias,
           bias_initializer=bias_initializer)
@@ -517,14 +519,14 @@
     inputs = keras.layers.Input(
         shape=[timestep, input_shape], dtype=dtypes.float32)
     with test_util.device(use_gpu=False):
-      layer = keras.layers.UnifiedLSTM(rnn_state_size)
+      layer = rnn.LSTM(rnn_state_size)
       output = layer(inputs)
       cpu_model = keras.models.Model(inputs, output)
       weights = cpu_model.get_weights()
     y_1 = cpu_model.predict(x_train)
 
     with test_util.device(use_gpu=True):
-      layer = keras.layers.UnifiedLSTM(rnn_state_size)
+      layer = rnn.LSTM(rnn_state_size)
       output = layer(inputs)
       gpu_model = keras.models.Model(inputs, output)
       gpu_model.set_weights(weights)
@@ -534,7 +536,7 @@
     # 'sigmoid' as default. Construct the canonical LSTM with sigmoid to achieve
     # the same output.
     with test_util.device(use_gpu=True):
-      layer = keras.layers.LSTM(rnn_state_size, recurrent_activation='sigmoid')
+      layer = rnn_v1.LSTM(rnn_state_size, recurrent_activation='sigmoid')
       output = layer(inputs)
       canonical_model = keras.models.Model(inputs, output)
       # Remove the extra cudnn bias since canonical lstm will not use it.
@@ -544,13 +546,13 @@
     self.assertAllClose(y_1, y_2)
     self.assertAllClose(y_2, y_3)
 
-  def test_return_sequences_LSTM(self):
+  def DISABLED_test_return_sequences_LSTM(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
     testing_utils.layer_test(
-        keras.layers.UnifiedLSTM,
+        rnn.LSTM,
         kwargs={
             'units': units,
             'return_sequences': True
@@ -559,7 +561,7 @@
 
   def test_regularizers_LSTM(self):
     embedding_dim = 4
-    layer_class = keras.layers.UnifiedLSTM
+    layer_class = rnn.LSTM
     layer = layer_class(
         5,
         return_sequences=False,
@@ -583,7 +585,7 @@
     timesteps = 3
     embedding_dim = 4
     units = 2
-    layer_class = keras.layers.UnifiedLSTM
+    layer_class = rnn.LSTM
     model = keras.models.Sequential()
     model.add(
         keras.layers.Embedding(
@@ -653,9 +655,7 @@
     model = keras.Sequential([
         keras.layers.Embedding(vocab_size, embedding_dim,
                                batch_input_shape=[batch_size, timestep]),
-        keras.layers.UnifiedLSTM(units,
-                                 return_sequences=True,
-                                 stateful=True),
+        rnn.LSTM(units, return_sequences=True, stateful=True),
         keras.layers.Dense(vocab_size)
     ])
     model.compile(optimizer='adam',
@@ -669,7 +669,7 @@
     embedding_dim = 4
     units = 2
     testing_utils.layer_test(
-        keras.layers.UnifiedLSTM,
+        rnn.LSTM,
         kwargs={
             'units': units,
             'dropout': 0.1,
@@ -698,7 +698,7 @@
           num_classes=output_shape)
       y_train = keras.utils.to_categorical(y_train, output_shape)
 
-      layer = keras.layers.UnifiedLSTM(rnn_state_size, return_runtime=True)
+      layer = rnn.LSTM(rnn_state_size, return_runtime=True)
 
       inputs = array_ops.placeholder(
           dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
@@ -747,7 +747,7 @@
           num_classes=output_shape)
       y_train = keras.utils.to_categorical(y_train, output_shape)
 
-      layer = keras.layers.UnifiedLSTM(rnn_state_size, return_runtime=True)
+      layer = rnn.LSTM(rnn_state_size, return_runtime=True)
 
       inputs = array_ops.placeholder(
           dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
@@ -827,7 +827,7 @@
     rnn_state_size = test_config['rnn_state_size']
     timestep = test_config['timestep']
 
-    layer = keras.layers.UnifiedLSTM(rnn_state_size)
+    layer = rnn.LSTM(rnn_state_size)
     inputs = keras.layers.Input(
         shape=[timestep, input_shape], dtype=dtypes.float32)
 
@@ -848,7 +848,7 @@
     rnn_state_size = test_config['rnn_state_size']
     timestep = test_config['timestep']
 
-    layer = keras.layers.LSTM(rnn_state_size)
+    layer = rnn_v1.LSTM(rnn_state_size)
     inputs = keras.layers.Input(
         shape=[timestep, input_shape], dtype=dtypes.float32)
 
diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py
index a94ca95..b32a8cd 100644
--- a/tensorflow/python/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/layers/recurrent.py
@@ -20,15 +20,10 @@
 from __future__ import print_function
 
 import collections
-import uuid
 
 import numpy as np
 
 from tensorflow.python.eager import context
-from tensorflow.python.eager import function
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import backend as K
@@ -40,7 +35,6 @@
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_cudnn_rnn_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.tracking import base as trackable
@@ -48,14 +42,6 @@
 from tensorflow.python.util.tf_export import keras_export
 
 
-# The following string constants are used by Defun approach for unified backend
-# of LSTM and GRU.
-_DEFUN_API_NAME_ATTRIBUTE = 'api_implements'
-_DEFUN_DEVICE_ATTRIBUTE = 'api_preferred_device'
-_CPU_DEVICE_NAME = 'CPU'
-_GPU_DEVICE_NAME = 'GPU'
-
-
 @keras_export('keras.layers.StackedRNNCells')
 class StackedRNNCells(Layer):
   """Wrapper allowing a stack of RNN cells to behave as a single cell.
@@ -193,7 +179,6 @@
     return cls(cells, **config)
 
 
-
 @keras_export('keras.layers.RNN')
 class RNN(Layer):
   """Base class for recurrent layers.
@@ -250,17 +235,6 @@
       Unrolling can speed-up a RNN,
       although it tends to be more memory-intensive.
       Unrolling is only suitable for short sequences.
-    input_dim: dimensionality of the input (integer or tuple of integers).
-      This argument (or alternatively, the keyword argument `input_shape`)
-      is required when using this layer as the first layer in a model.
-    input_length: Length of input sequences, to be specified
-      when it is constant.
-      This argument is required if you are going to connect
-      `Flatten` then `Dense` layers upstream
-      (without it, the shape of the dense outputs cannot be computed).
-      Note that if the recurrent layer is not the first layer
-      in your model, you would need to specify the input length
-      at the level of the first layer (e.g. via the `input_shape` argument).
     time_major: The shape format of the `inputs` and `outputs` tensors.
         If True, the inputs and outputs will be in shape
         `(timesteps, batch, ...)`, whereas in the False case, it will be
@@ -410,6 +384,13 @@
     # If True, the output for masked timestep will be zeros, whereas in the
     # False case, output from previous timestep is returned for masked timestep.
     self.zero_output_for_mask = kwargs.pop('zero_output_for_mask', False)
+
+    if 'input_shape' not in kwargs and (
+        'input_dim' in kwargs or 'input_length' in kwargs):
+      input_shape = (kwargs.pop('input_length', None),
+                     kwargs.pop('input_dim', None))
+      kwargs['input_shape'] = input_shape
+
     super(RNN, self).__init__(**kwargs)
     self.cell = cell
     self.return_sequences = return_sequences
@@ -2068,392 +2049,6 @@
     return cls(**config)
 
 
-@keras_export('keras.layers.GRU', v1=[])
-class UnifiedGRU(DropoutRNNCellMixin, GRU):
-  """Gated Recurrent Unit - Cho et al. 2014.
-
-  Based on available runtime hardware and constraints, this layer
-  will choose different implementations (cuDNN-based or pure-TensorFlow)
-  to maximize the performance. If a GPU is available and all
-  the arguments to the layer meet the requirement of the CuDNN kernel
-  (see below for details), the layer will use a fast cuDNN implementation.
-
-  The requirements to use the cuDNN implementation are:
-
-  1. `activation` == 'tanh'
-  2. `recurrent_activation` == 'sigmoid'
-  3. `recurrent_dropout` == 0
-  4. `unroll` is False
-  5. `use_bias` is True
-  6. `reset_after` is True
-  7. No use of masking.
-
-  There are two variants of the GRU implementation. The default one is based on
-  [v3](https://arxiv.org/abs/1406.1078v3) and has reset gate applied to hidden
-  state before matrix multiplication. The other one is based on
-  [original](https://arxiv.org/abs/1406.1078v1) and has the order reversed.
-
-  The second variant is compatible with CuDNNGRU (GPU-only) and allows
-  inference on CPU. Thus it has separate biases for `kernel` and
-  `recurrent_kernel`. To use this variant, set `'reset_after'=True` and
-  `recurrent_activation='sigmoid'`.
-
-  Arguments:
-    units: Positive integer, dimensionality of the output space.
-    activation: Activation function to use.
-      Default: hyperbolic tangent (`tanh`).
-      If you pass `None`, no activation is applied
-      (ie. "linear" activation: `a(x) = x`).
-    recurrent_activation: Activation function to use
-      for the recurrent step.
-      Default: sigmoid (`sigmoid`).
-      If you pass `None`, no activation is applied
-      (ie. "linear" activation: `a(x) = x`).
-    use_bias: Boolean, whether the layer uses a bias vector.
-    kernel_initializer: Initializer for the `kernel` weights matrix,
-      used for the linear transformation of the inputs.
-    recurrent_initializer: Initializer for the `recurrent_kernel`
-       weights matrix,
-       used for the linear transformation of the recurrent state.
-    bias_initializer: Initializer for the bias vector.
-    kernel_regularizer: Regularizer function applied to
-      the `kernel` weights matrix.
-    recurrent_regularizer: Regularizer function applied to
-      the `recurrent_kernel` weights matrix.
-    bias_regularizer: Regularizer function applied to the bias vector.
-    activity_regularizer: Regularizer function applied to
-      the output of the layer (its "activation")..
-    kernel_constraint: Constraint function applied to
-      the `kernel` weights matrix.
-    recurrent_constraint: Constraint function applied to
-      the `recurrent_kernel` weights matrix.
-    bias_constraint: Constraint function applied to the bias vector.
-    dropout: Float between 0 and 1.
-      Fraction of the units to drop for the linear transformation of the inputs.
-    recurrent_dropout: Float between 0 and 1.
-      Fraction of the units to drop for
-      the linear transformation of the recurrent state.
-    implementation: Implementation mode, either 1 or 2.
-      Mode 1 will structure its operations as a larger number of
-      smaller dot products and additions, whereas mode 2 will
-      batch them into fewer, larger operations. These modes will
-      have different performance profiles on different hardware and
-      for different applications.
-    return_sequences: Boolean. Whether to return the last output
-      in the output sequence, or the full sequence.
-    return_state: Boolean. Whether to return the last state
-      in addition to the output.
-    go_backwards: Boolean (default False).
-      If True, process the input sequence backwards and return the
-      reversed sequence.
-    stateful: Boolean (default False). If True, the last state
-      for each sample at index i in a batch will be used as initial
-      state for the sample of index i in the following batch.
-    unroll: Boolean (default False).
-      If True, the network will be unrolled,
-      else a symbolic loop will be used.
-      Unrolling can speed-up a RNN,
-      although it tends to be more memory-intensive.
-      Unrolling is only suitable for short sequences.
-    reset_after: GRU convention (whether to apply reset gate after or
-      before matrix multiplication). False = "before",
-      True = "after" (default and CuDNN compatible).
-
-  Call arguments:
-    inputs: A 3D tensor.
-    mask: Binary tensor of shape `(samples, timesteps)` indicating whether
-      a given timestep should be masked.
-    training: Python boolean indicating whether the layer should behave in
-      training mode or in inference mode. This argument is passed to the cell
-      when calling it. This is only relevant if `dropout` or
-      `recurrent_dropout` is used.
-    initial_state: List of initial state tensors to be passed to the first
-      call of the cell.
-  """
-
-  def __init__(self,
-               units,
-               activation='tanh',
-               recurrent_activation='sigmoid',
-               use_bias=True,
-               kernel_initializer='glorot_uniform',
-               recurrent_initializer='orthogonal',
-               bias_initializer='zeros',
-               kernel_regularizer=None,
-               recurrent_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               kernel_constraint=None,
-               recurrent_constraint=None,
-               bias_constraint=None,
-               dropout=0.,
-               recurrent_dropout=0.,
-               implementation=1,
-               return_sequences=False,
-               return_state=False,
-               go_backwards=False,
-               stateful=False,
-               unroll=False,
-               time_major=False,
-               reset_after=True,
-               **kwargs):
-    # return_runtime is a flag for testing, which shows the real backend
-    # implementation chosen by grappler in graph mode.
-    self._return_runtime = kwargs.pop('return_runtime', False)
-
-    super(UnifiedGRU, self).__init__(
-        units,
-        activation=activation,
-        recurrent_activation=recurrent_activation,
-        use_bias=use_bias,
-        kernel_initializer=kernel_initializer,
-        recurrent_initializer=recurrent_initializer,
-        bias_initializer=bias_initializer,
-        kernel_regularizer=kernel_regularizer,
-        recurrent_regularizer=recurrent_regularizer,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        kernel_constraint=kernel_constraint,
-        recurrent_constraint=recurrent_constraint,
-        bias_constraint=bias_constraint,
-        dropout=dropout,
-        recurrent_dropout=recurrent_dropout,
-        implementation=implementation,
-        return_sequences=return_sequences,
-        return_state=return_state,
-        go_backwards=go_backwards,
-        stateful=stateful,
-        unroll=unroll,
-        time_major=time_major,
-        reset_after=reset_after,
-        **kwargs)
-    # CuDNN uses following setting by default and not configurable.
-    self.could_use_cudnn = (
-        activation == 'tanh' and recurrent_activation == 'sigmoid' and
-        recurrent_dropout == 0 and not unroll and use_bias and
-        reset_after is True)
-
-  def call(self, inputs, mask=None, training=None, initial_state=None):
-    # GRU does not support constants. Ignore it during process.
-    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
-
-    if isinstance(mask, list):
-      mask = mask[0]
-
-    input_shape = K.int_shape(inputs)
-    timesteps = input_shape[0] if self.time_major else input_shape[1]
-
-    if mask is not None or not self.could_use_cudnn:
-      # CuDNN does not support masking, fall back to use the normal GRU.
-      kwargs = {'training': training}
-
-      def step(cell_inputs, cell_states):
-        return self.cell.call(cell_inputs, cell_states, **kwargs)
-
-      last_output, outputs, states = K.rnn(
-          step,
-          inputs,
-          initial_state,
-          constants=None,
-          go_backwards=self.go_backwards,
-          mask=mask,
-          unroll=self.unroll,
-          input_length=timesteps,
-          time_major=self.time_major,
-          zero_output_for_mask=self.zero_output_for_mask)
-      # This is a dummy tensor for testing purpose.
-      runtime = _runtime('unknown')
-    else:
-      last_output, outputs, runtime, states = self._defun_gru_call(
-          inputs, initial_state, training)
-
-    if self.stateful:
-      updates = [state_ops.assign(self.states[0], states[0])]
-      self.add_update(updates, inputs)
-
-    if self.return_sequences:
-      output = outputs
-    else:
-      output = last_output
-
-    if self.return_state:
-      return [output] + list(states)
-    elif self._return_runtime:
-      return output, runtime
-    else:
-      return output
-
-  def _defun_gru_call(self, inputs, initial_state, training):
-    # Use the new defun approach for backend implementation swap.
-    # Note that different implementations need to have same function
-    # signature, eg, the tensor parameters need to have same shape and dtypes.
-    if self.go_backwards:
-      # Reverse time axis.
-      inputs = K.reverse(inputs, 0 if self.time_major else 1)
-
-    self.reset_dropout_mask()
-    dropout_mask = self.get_dropout_mask_for_cell(inputs, training, count=3)
-    if dropout_mask is not None:
-      inputs *= dropout_mask[0]
-    if ops.executing_eagerly_outside_functions():
-      # Under eager context, the device placement is already known. Prefer the
-      # GPU implementation when GPU is available.
-      if context.num_gpus() > 0:
-        last_output, outputs, new_h, runtime = cudnn_gru(
-            inputs=inputs,
-            init_h=initial_state[0],
-            kernel=self.cell.kernel,
-            recurrent_kernel=self.cell.recurrent_kernel,
-            bias=self.cell.bias,
-            time_major=self.time_major)
-      else:
-        last_output, outputs, new_h, runtime = standard_gru(
-            inputs=inputs,
-            init_h=initial_state[0],
-            kernel=self.cell.kernel,
-            recurrent_kernel=self.cell.recurrent_kernel,
-            bias=self.cell.bias,
-            activation=self.activation,
-            recurrent_activation=self.recurrent_activation,
-            time_major=self.time_major)
-    else:
-      api_name = 'gru_' + str(uuid.uuid4())
-      defun_standard_gru = _generate_defun_backend(
-          api_name, _CPU_DEVICE_NAME, standard_gru)
-      defun_cudnn_gru = _generate_defun_backend(
-          api_name, _GPU_DEVICE_NAME, cudnn_gru)
-      # Call the normal GRU impl and register the CuDNN impl function. The
-      # grappler will kick in during session execution to optimize the graph.
-      last_output, outputs, new_h, runtime = defun_standard_gru(
-          inputs=inputs,
-          init_h=initial_state[0],
-          kernel=self.cell.kernel,
-          recurrent_kernel=self.cell.recurrent_kernel,
-          bias=self.cell.bias,
-          activation=self.activation,
-          recurrent_activation=self.recurrent_activation,
-          time_major=self.time_major)
-
-      function.register(defun_cudnn_gru, inputs, initial_state[0],
-                        self.cell.kernel, self.cell.recurrent_kernel,
-                        self.cell.bias, self.time_major)
-    states = [new_h]
-    return last_output, outputs, runtime, states
-
-
-def standard_gru(inputs, init_h, kernel, recurrent_kernel, bias, activation,
-                 recurrent_activation, time_major):
-  """GRU with standard kernel implementation.
-
-  This implementation can be run on all types of hardware.
-
-  This implementation lifts out all the layer weights and make them function
-  parameters. It has same number of tensor input params as the CuDNN
-  counterpart. The RNN step logic has been simplified, eg dropout and mask is
-  removed since CuDNN implementation does not support that.
-
-  Arguments:
-    inputs: input tensor of GRU layer.
-    init_h: initial state tensor for the cell output.
-    kernel: weights for cell kernel.
-    recurrent_kernel: weights for cell recurrent kernel.
-    bias: weights for cell kernel bias and recurrent bias. The bias contains the
-      combined input_bias and recurrent_bias.
-    activation: Activation function to use for output.
-    recurrent_activation: Activation function to use for hidden recurrent state.
-    time_major: boolean, whether the inputs are in the format of
-      [time, batch, feature] or [batch, time, feature].
-
-  Returns:
-    last_output: output tensor for the last timestep, which has shape
-      [batch, units].
-    outputs: output tensor for all timesteps, which has shape
-      [batch, time, units].
-    state_0: the cell output, which has same shape as init_h.
-    runtime: constant string tensor which indicate real runtime hardware. This
-      value is for testing purpose and should be used by user.
-  """
-  input_shape = K.int_shape(inputs)
-  timesteps = input_shape[0] if time_major else input_shape[1]
-
-  input_bias, recurrent_bias = array_ops.unstack(bias)
-
-  def step(cell_inputs, cell_states):
-    """Step function that will be used by Keras RNN backend."""
-    h_tm1 = cell_states[0]
-
-    # inputs projected by all gate matrices at once
-    matrix_x = K.dot(cell_inputs, kernel)
-    matrix_x = K.bias_add(matrix_x, input_bias)
-
-    x_z, x_r, x_h = array_ops.split(matrix_x, 3, axis=1)
-
-    # hidden state projected by all gate matrices at once
-    matrix_inner = K.dot(h_tm1, recurrent_kernel)
-    matrix_inner = K.bias_add(matrix_inner, recurrent_bias)
-
-    recurrent_z, recurrent_r, recurrent_h = array_ops.split(matrix_inner, 3,
-                                                            axis=1)
-    z = recurrent_activation(x_z + recurrent_z)
-    r = recurrent_activation(x_r + recurrent_r)
-    hh = activation(x_h + r * recurrent_h)
-
-    # previous and candidate state mixed by update gate
-    h = z * h_tm1 + (1 - z) * hh
-    return h, [h]
-
-  last_output, outputs, new_states = K.rnn(
-      step,
-      inputs, [init_h],
-      constants=None,
-      unroll=False,
-      time_major=time_major,
-      input_length=timesteps)
-  return last_output, outputs, new_states[0], _runtime('cpu')
-
-
-def cudnn_gru(inputs, init_h, kernel, recurrent_kernel, bias, time_major):
-  """GRU with CuDNN implementation which is only available for GPU."""
-  if not time_major:
-    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
-  init_h = array_ops.expand_dims(init_h, axis=0)
-
-  weights = array_ops.split(kernel, 3, axis=1)
-  weights += array_ops.split(recurrent_kernel, 3, axis=1)
-  # Note that the bias was initialized as shape (2, 3 * units), flat it into
-  # (6 * units)
-  bias = array_ops.split(K.flatten(bias), 6)
-  # Note that the gate order for CuDNN is different from the canonical format.
-  # canonical format is [z, r, h], whereas CuDNN is [r, z, h]. The swap need to
-  # be done for kernel, recurrent_kernel, input_bias, recurrent_bias.
-  # z is update gate weights.
-  # r is reset gate weights.
-  # h is output gate weights.
-  weights[0], weights[1] = weights[1], weights[0]
-  weights[3], weights[4] = weights[4], weights[3]
-  bias[0], bias[1] = bias[1], bias[0]
-  bias[3], bias[4] = bias[4], bias[3]
-
-  params = _canonical_to_params(
-      weights=weights,
-      biases=bias,
-      shape=constant_op.constant([-1]),
-      transpose_weights=True)
-
-  outputs, h, _, _ = gen_cudnn_rnn_ops.cudnn_rnn(
-      inputs,
-      input_h=init_h,
-      input_c=0,
-      params=params,
-      is_training=True,
-      rnn_mode='gru')
-  last_output = outputs[-1]
-  if not time_major:
-    outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
-  h = h[0]
-  return last_output, outputs, h, _runtime('cudnn')
-
-
 @keras_export('keras.layers.LSTMCell')
 class LSTMCell(DropoutRNNCellMixin, Layer):
   """Cell class for the LSTM layer.
@@ -3070,386 +2665,6 @@
     return cls(**config)
 
 
-@keras_export('keras.layers.LSTM', v1=[])
-class UnifiedLSTM(DropoutRNNCellMixin, LSTM):
-  """Long Short-Term Memory layer - Hochreiter 1997.
-
-  Based on available runtime hardware and constraints, this layer
-  will choose different implementations (cuDNN-based or pure-TensorFlow)
-  to maximize the performance. If a GPU is available and all
-  the arguments to the layer meet the requirement of the CuDNN kernel
-  (see below for details), the layer will use a fast cuDNN implementation.
-
-  The requirements to use the cuDNN implementation are:
-
-  1. `activation` == 'tanh'
-  2. `recurrent_activation` == 'sigmoid'
-  3. `recurrent_dropout` == 0
-  4. `unroll` is False
-  5. `use_bias` is True
-  7. No use of masking.
-
-  Arguments:
-    units: Positive integer, dimensionality of the output space.
-    activation: Activation function to use.
-      Default: hyperbolic tangent (`tanh`). If you pass `None`, no activation
-      is applied (ie. "linear" activation: `a(x) = x`).
-    recurrent_activation: Activation function to use for the recurrent step.
-      Default: sigmoid (`sigmoid`). If you pass `None`, no activation is
-      applied (ie. "linear" activation: `a(x) = x`).
-    use_bias: Boolean, whether the layer uses a bias vector.
-    kernel_initializer: Initializer for the `kernel` weights matrix, used for
-      the linear transformation of the inputs..
-    recurrent_initializer: Initializer for the `recurrent_kernel` weights
-      matrix, used for the linear transformation of the recurrent state..
-    bias_initializer: Initializer for the bias vector.
-    unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at
-      initialization. Setting it to true will also force
-      `bias_initializer="zeros"`. This is recommended in [Jozefowicz et
-          al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf).
-    kernel_regularizer: Regularizer function applied to the `kernel` weights
-      matrix.
-    recurrent_regularizer: Regularizer function applied to the
-      `recurrent_kernel` weights matrix.
-    bias_regularizer: Regularizer function applied to the bias vector.
-    activity_regularizer: Regularizer function applied to the output of the
-      layer (its "activation")..
-    kernel_constraint: Constraint function applied to the `kernel` weights
-      matrix.
-    recurrent_constraint: Constraint function applied to the `recurrent_kernel`
-      weights matrix.
-    bias_constraint: Constraint function applied to the bias vector.
-    dropout: Float between 0 and 1. Fraction of the units to drop for the linear
-      transformation of the inputs.
-    recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for
-      the linear transformation of the recurrent state.
-    implementation: Implementation mode, either 1 or 2. Mode 1 will structure
-      its operations as a larger number of smaller dot products and additions,
-      whereas mode 2 will batch them into fewer, larger operations. These modes
-      will have different performance profiles on different hardware and for
-      different applications.
-    return_sequences: Boolean. Whether to return the last output. in the output
-      sequence, or the full sequence.
-    return_state: Boolean. Whether to return the last state in addition to the
-      output.
-    go_backwards: Boolean (default False). If True, process the input sequence
-      backwards and return the reversed sequence.
-    stateful: Boolean (default False). If True, the last state for each sample
-      at index i in a batch will be used as initial state for the sample of
-      index i in the following batch.
-    unroll: Boolean (default False). If True, the network will be unrolled, else
-      a symbolic loop will be used. Unrolling can speed-up a RNN, although it
-      tends to be more memory-intensive. Unrolling is only suitable for short
-      sequences.
-
-  Call arguments:
-    inputs: A 3D tensor.
-    mask: Binary tensor of shape `(samples, timesteps)` indicating whether
-      a given timestep should be masked.
-    training: Python boolean indicating whether the layer should behave in
-      training mode or in inference mode. This argument is passed to the cell
-      when calling it. This is only relevant if `dropout` or
-      `recurrent_dropout` is used.
-    initial_state: List of initial state tensors to be passed to the first
-      call of the cell.
-  """
-
-  def __init__(self,
-               units,
-               activation='tanh',
-               recurrent_activation='sigmoid',
-               use_bias=True,
-               kernel_initializer='glorot_uniform',
-               recurrent_initializer='orthogonal',
-               bias_initializer='zeros',
-               unit_forget_bias=True,
-               kernel_regularizer=None,
-               recurrent_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               kernel_constraint=None,
-               recurrent_constraint=None,
-               bias_constraint=None,
-               dropout=0.,
-               recurrent_dropout=0.,
-               implementation=1,
-               return_sequences=False,
-               return_state=False,
-               go_backwards=False,
-               stateful=False,
-               time_major=False,
-               unroll=False,
-               **kwargs):
-    # return_runtime is a flag for testing, which shows the real backend
-    # implementation chosen by grappler in graph mode.
-    self.return_runtime = kwargs.pop('return_runtime', False)
-
-    super(UnifiedLSTM, self).__init__(
-        units,
-        activation=activation,
-        recurrent_activation=recurrent_activation,
-        use_bias=use_bias,
-        kernel_initializer=kernel_initializer,
-        recurrent_initializer=recurrent_initializer,
-        bias_initializer=bias_initializer,
-        unit_forget_bias=unit_forget_bias,
-        kernel_regularizer=kernel_regularizer,
-        recurrent_regularizer=recurrent_regularizer,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        kernel_constraint=kernel_constraint,
-        recurrent_constraint=recurrent_constraint,
-        bias_constraint=bias_constraint,
-        dropout=dropout,
-        recurrent_dropout=recurrent_dropout,
-        implementation=implementation,
-        return_sequences=return_sequences,
-        return_state=return_state,
-        go_backwards=go_backwards,
-        stateful=stateful,
-        time_major=time_major,
-        unroll=unroll,
-        **kwargs)
-
-    self.state_spec = [
-        InputSpec(shape=(None, dim)) for dim in (self.units, self.units)
-    ]
-    self.could_use_cudnn = (
-        activation == 'tanh' and recurrent_activation == 'sigmoid' and
-        recurrent_dropout == 0 and not unroll and use_bias)
-
-  def call(self, inputs, mask=None, training=None, initial_state=None):
-    # LSTM does not support constants. Ignore it during process.
-    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
-
-    if isinstance(mask, list):
-      mask = mask[0]
-
-    input_shape = K.int_shape(inputs)
-    timesteps = input_shape[0] if self.time_major else input_shape[1]
-
-    if mask is not None or not self.could_use_cudnn:
-      # CuDNN does not support masking, fall back to use the normal LSTM.
-      kwargs = {'training': training}
-
-      def step(inputs, states):
-        return self.cell.call(inputs, states, **kwargs)
-
-      last_output, outputs, states = K.rnn(
-          step,
-          inputs,
-          initial_state,
-          constants=None,
-          go_backwards=self.go_backwards,
-          mask=mask,
-          unroll=self.unroll,
-          input_length=timesteps,
-          time_major=self.time_major,
-          zero_output_for_mask=self.zero_output_for_mask)
-      runtime = _runtime('unknown')
-    else:
-      # Use the new defun approach for backend implementation swap.
-      # Note that different implementations need to have same function
-      # signature, eg, the tensor parameters need to have same shape and dtypes.
-      # Since the CuDNN has an extra set of bias, those bias will be passed to
-      # both normal and CuDNN implementations.
-      if self.go_backwards:
-        # Reverse time axis.
-        inputs = K.reverse(inputs, 0 if self.time_major else 1)
-
-      self.reset_dropout_mask()
-      dropout_mask = self.get_dropout_mask_for_cell(inputs, training, count=4)
-      if dropout_mask is not None:
-        inputs *= dropout_mask[0]
-
-      if ops.executing_eagerly_outside_functions():
-        # Under eager context, the device placement is already known. Prefer the
-        # GPU implementation here.
-        if context.num_gpus() > 0:
-          last_output, outputs, new_h, new_c, runtime = cudnn_lstm(
-              inputs, initial_state[0], initial_state[1], self.cell.kernel,
-              self.cell.recurrent_kernel, self.cell.bias, self.time_major)
-        else:
-          last_output, outputs, new_h, new_c, runtime = standard_lstm(
-              inputs, initial_state[0], initial_state[1], self.cell.kernel,
-              self.cell.recurrent_kernel, self.cell.bias, self.activation,
-              self.recurrent_activation, self.time_major)
-      else:
-        # Each time a `tf.function` is called, we will give it a unique
-        # identifiable API name, so that Grappler won't get confused when it
-        # sees multiple LSTM layers added into same graph, and it will be able
-        # to pair up the different implementations across them.
-        api_name = 'lstm_' + str(uuid.uuid4())
-        defun_standard_lstm = _generate_defun_backend(
-            api_name, _CPU_DEVICE_NAME, standard_lstm)
-        defun_cudnn_lstm = _generate_defun_backend(
-            api_name, _GPU_DEVICE_NAME, cudnn_lstm)
-
-        # Call the normal LSTM impl and register the CuDNN impl function. The
-        # grappler will kick in during session execution to optimize the graph.
-        last_output, outputs, new_h, new_c, runtime = defun_standard_lstm(
-            inputs, initial_state[0], initial_state[1], self.cell.kernel,
-            self.cell.recurrent_kernel, self.cell.bias, self.activation,
-            self.recurrent_activation, self.time_major)
-
-        function.register(defun_cudnn_lstm, inputs, initial_state[0],
-                          initial_state[1], self.cell.kernel,
-                          self.cell.recurrent_kernel, self.cell.bias,
-                          self.time_major)
-      states = [new_h, new_c]
-
-    if self.stateful:
-      updates = []
-      for i in range(len(states)):
-        updates.append(state_ops.assign(self.states[i], states[i]))
-      self.add_update(updates, inputs)
-
-    if self.return_sequences:
-      output = outputs
-    else:
-      output = last_output
-
-    if self.return_state:
-      return [output] + list(states)
-    elif self.return_runtime:
-      return output, runtime
-    else:
-      return output
-
-
-def _canonical_to_params(weights, biases, shape, transpose_weights=False):
-  """Utility function convert variable to CuDNN compatible parameter.
-
-  Note that Keras weights for kernels are different from the CuDNN format. Eg.:
-
-  ```
-    Keras                 CuDNN
-    [[0, 1, 2],  <--->  [[0, 2, 4],
-     [3, 4, 5]]          [1, 3, 5]]
-  ```
-
-  If the input weights need to be in a unified format, then set
-  `transpose_weights=True` to convert the weights.
-
-  Args:
-    weights: list of weights for the individual kernels and recurrent kernels.
-    biases: list of biases for individual gate.
-    shape: the shape for the converted variables that will be feed to CuDNN.
-    transpose_weights: boolean, whether to transpose the weights.
-
-  Returns:
-    The converted weights that can be feed to CuDNN ops as param.
-  """
-  def convert(w):
-    return array_ops.transpose(w) if transpose_weights else w
-
-  weights = [array_ops.reshape(convert(x), shape) for x in weights]
-  biases = [array_ops.reshape(x, shape) for x in biases]
-  return array_ops.concat(weights + biases, axis=0)
-
-
-def standard_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias,
-                  activation, recurrent_activation, time_major):
-  """LSTM with standard kernel implementation.
-
-  This implementation can be run on all types for hardware.
-
-  This implementation lifts out all the layer weights and make them function
-  parameters. It has same number of tensor input params as the CuDNN
-  counterpart. The RNN step logic has been simplified, eg dropout and mask is
-  removed since CuDNN implementation does not support that.
-
-  Note that the first half of the bias tensor should be ignored by this impl.
-  The CuDNN impl need an extra set of input gate bias. In order to make the both
-  function take same shape of parameter, that extra set of bias is also feed
-  here.
-
-  Args:
-    inputs: input tensor of LSTM layer.
-    init_h: initial state tensor for the cell output.
-    init_c: initial state tensor for the cell hidden state.
-    kernel: weights for cell kernel.
-    recurrent_kernel: weights for cell recurrent kernel.
-    bias: weights for cell kernel bias and recurrent bias. Only recurrent bias
-      is used in this case.
-    activation: Activation function to use for output.
-    recurrent_activation: Activation function to use for hidden recurrent state.
-    time_major: boolean, whether the inputs are in the format of
-      [time, batch, feature] or [batch, time, feature].
-
-  Returns:
-    last_output: output tensor for the last timestep, which has shape
-      [batch, units].
-    outputs: output tensor for all timesteps, which has shape
-      [batch, time, units].
-    state_0: the cell output, which has same shape as init_h.
-    state_1: the cell hidden state, which has same shape as init_c.
-    runtime: constant string tensor which indicate real runtime hardware. This
-      value is for testing purpose and should be used by user.
-  """
-  input_shape = K.int_shape(inputs)
-  timesteps = input_shape[0] if time_major else input_shape[1]
-
-  def step(cell_inputs, cell_states):
-    """Step function that will be used by Keras RNN backend."""
-    h_tm1 = cell_states[0]  # previous memory state
-    c_tm1 = cell_states[1]  # previous carry state
-
-    z = K.dot(cell_inputs, kernel)
-    z += K.dot(h_tm1, recurrent_kernel)
-    z = K.bias_add(z, bias)
-
-    z0, z1, z2, z3 = array_ops.split(z, 4, axis=1)
-
-    i = recurrent_activation(z0)
-    f = recurrent_activation(z1)
-    c = f * c_tm1 + i * activation(z2)
-    o = recurrent_activation(z3)
-
-    h = o * activation(c)
-    return h, [h, c]
-
-  last_output, outputs, new_states = K.rnn(
-      step,
-      inputs, [init_h, init_c],
-      constants=None,
-      unroll=False,
-      time_major=time_major,
-      input_length=timesteps)
-  return last_output, outputs, new_states[0], new_states[1], _runtime('cpu')
-
-
-def cudnn_lstm(inputs, input_h, input_c, kernel, recurrent_kernel, bias,
-               time_major):
-  """LSTM with CuDNN implementation which is only available for GPU."""
-  if not time_major:
-    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
-  input_h = array_ops.expand_dims(input_h, axis=0)
-  input_c = array_ops.expand_dims(input_c, axis=0)
-
-  weights = array_ops.split(kernel, 4, axis=1)
-  weights += array_ops.split(recurrent_kernel, 4, axis=1)
-  # CuDNN has an extra set of bias for inputs, we disable them (setting to 0),
-  # so that mathematically it is same as the canonical LSTM implementation.
-  full_bias = array_ops.concat((array_ops.zeros_like(bias), bias), 0)
-
-  params = _canonical_to_params(
-      weights=weights,
-      biases=array_ops.split(full_bias, 8),
-      shape=constant_op.constant([-1]),
-      transpose_weights=True)
-
-  outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
-      inputs, input_h=input_h, input_c=input_c, params=params, is_training=True)
-  last_output = outputs[-1]
-  if not time_major:
-    outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
-  h = h[0]
-  c = c[0]
-
-  return last_output, outputs, h, c, _runtime('cudnn')
-
-
 def _generate_dropout_mask(ones, rate, training=None, count=1):
   def dropped_inputs():
     return K.dropout(ones, rate)
@@ -3557,19 +2772,3 @@
     return nest.map_structure(create_zeros, state_size)
   else:
     return create_zeros(state_size)
-
-
-def _generate_defun_backend(unique_api_name, preferred_device, func):
-  function_attributes = {
-      _DEFUN_API_NAME_ATTRIBUTE: unique_api_name,
-      _DEFUN_DEVICE_ATTRIBUTE: preferred_device,
-  }
-  return function.defun_with_attributes(func=func,
-                                        attributes=function_attributes)
-
-
-def _runtime(runtime_name):
-  with ops.device('/cpu:0'):
-    return constant_op.constant(
-        runtime_name, dtype=dtypes.string, name='runtime')
-
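The code removed above (and re-added in `recurrent_v2.py` further down) selects between a standard and a CuDNN kernel by wrapping two functions of identical signature in defuns that share an `api_implements` attribute, which Grappler can swap at session run time. A condensed sketch of that registration pattern follows; the toy `cpu_impl`/`gpu_impl` functions are assumptions for illustration, while the helper shape and attribute names come from the diff itself.

```python
# Condensed sketch of the Defun backend-selection pattern moved to
# recurrent_v2.py. The toy implementations are illustrative assumptions.
import uuid

from tensorflow.python.eager import function
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import ops

_DEFUN_API_NAME_ATTRIBUTE = 'api_implements'
_DEFUN_DEVICE_ATTRIBUTE = 'api_preferred_device'


def _generate_defun_backend(unique_api_name, preferred_device, func):
  # Attach matching attributes so Grappler's implementation selector can pair
  # the CPU and GPU defuns and swap them during session execution.
  return function.defun_with_attributes(
      func=func,
      attributes={
          _DEFUN_API_NAME_ATTRIBUTE: unique_api_name,
          _DEFUN_DEVICE_ATTRIBUTE: preferred_device,
      })


def cpu_impl(x):
  return x * 2.0


def gpu_impl(x):
  return x * 2.0  # must share cpu_impl's signature (shapes and dtypes)


api_name = 'toy_' + str(uuid.uuid4())
defun_cpu = _generate_defun_backend(api_name, 'CPU', cpu_impl)
defun_gpu = _generate_defun_backend(api_name, 'GPU', gpu_impl)

with ops.Graph().as_default():
  x = constant_op.constant(2.0)
  y = defun_cpu(x)                 # build the graph with the CPU variant
  function.register(defun_gpu, x)  # register the GPU variant for Grappler
```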
diff --git a/tensorflow/python/keras/layers/recurrent_test.py b/tensorflow/python/keras/layers/recurrent_test.py
index 537e3a5..41bbdac 100644
--- a/tensorflow/python/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/layers/recurrent_test.py
@@ -35,6 +35,8 @@
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.layers import recurrent as rnn_v1
+from tensorflow.python.keras.layers import recurrent_v2 as rnn_v2
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -730,8 +732,8 @@
 
   @parameterized.named_parameters(
       *test_util.generate_combinations_with_testcase_name(
-          layer=[keras.layers.SimpleRNN, keras.layers.GRU, keras.layers.LSTM,
-                 keras.layers.UnifiedGRU, keras.layers.UnifiedLSTM],
+          layer=[rnn_v1.SimpleRNN, rnn_v1.GRU, rnn_v1.LSTM,
+                 rnn_v2.GRU, rnn_v2.LSTM],
           unroll=[True, False]))
   def test_rnn_dropout(self, layer, unroll):
     rnn_layer = layer(3, dropout=0.1, recurrent_dropout=0.1, unroll=unroll)
@@ -1326,6 +1328,16 @@
       custom_rnn = keras.layers.RNN(cell, stateful=True)
       custom_rnn.reset_states()
 
+  def test_input_dim_length(self):
+    simple_rnn = keras.layers.SimpleRNN(5, input_length=10, input_dim=8)
+    self.assertEqual(simple_rnn._batch_input_shape, (None, 10, 8))
+
+    simple_rnn = keras.layers.SimpleRNN(5, input_dim=8)
+    self.assertEqual(simple_rnn._batch_input_shape, (None, None, 8))
+
+    simple_rnn = keras.layers.SimpleRNN(5, input_length=10)
+    self.assertEqual(simple_rnn._batch_input_shape, (None, 10, None))
+
 
 class Minimal2DRNNCell(keras.layers.Layer):
   """The minimal 2D RNN cell is a simple combination of 2 1-D RNN cell.
diff --git a/tensorflow/python/keras/layers/recurrent_v2.py b/tensorflow/python/keras/layers/recurrent_v2.py
new file mode 100644
index 0000000..6bafa88
--- /dev/null
+++ b/tensorflow/python/keras/layers/recurrent_v2.py
@@ -0,0 +1,823 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Recurrent layers for TF 2.0.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import uuid
+
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.engine.input_spec import InputSpec
+from tensorflow.python.keras.layers import recurrent
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_cudnn_rnn_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.util.tf_export import keras_export
+
+
+# The following string constants are used by the Defun approach for the
+# unified LSTM and GRU backend.
+_DEFUN_API_NAME_ATTRIBUTE = 'api_implements'
+_DEFUN_DEVICE_ATTRIBUTE = 'api_preferred_device'
+_CPU_DEVICE_NAME = 'CPU'
+_GPU_DEVICE_NAME = 'GPU'
+
+
+@keras_export('keras.layers.GRU', v1=[])
+class GRU(recurrent.DropoutRNNCellMixin, recurrent.GRU):
+  """Gated Recurrent Unit - Cho et al. 2014.
+
+  Based on available runtime hardware and constraints, this layer
+  will choose different implementations (cuDNN-based or pure-TensorFlow)
+  to maximize the performance. If a GPU is available and all
+  the arguments to the layer meet the requirement of the CuDNN kernel
+  (see below for details), the layer will use a fast cuDNN implementation.
+
+  The requirements to use the cuDNN implementation are:
+
+  1. `activation` == 'tanh'
+  2. `recurrent_activation` == 'sigmoid'
+  3. `recurrent_dropout` == 0
+  4. `unroll` is False
+  5. `use_bias` is True
+  6. `reset_after` is True
+  7. No use of masking.
+
+  There are two variants of the GRU implementation. The default one is based on
+  [v3](https://arxiv.org/abs/1406.1078v3) and applies the reset gate to the
+  hidden state before matrix multiplication. The other one is based on the
+  [original](https://arxiv.org/abs/1406.1078v1) and has the order reversed.
+
+  The second variant is compatible with CuDNNGRU (GPU-only) and allows
+  inference on CPU. Thus it has separate biases for `kernel` and
+  `recurrent_kernel`. To use this variant, set `reset_after=True` and
+  `recurrent_activation='sigmoid'`.
+
+  Arguments:
+    units: Positive integer, dimensionality of the output space.
+    activation: Activation function to use.
+      Default: hyperbolic tangent (`tanh`).
+      If you pass `None`, no activation is applied
+      (i.e. "linear" activation: `a(x) = x`).
+    recurrent_activation: Activation function to use
+      for the recurrent step.
+      Default: sigmoid (`sigmoid`).
+      If you pass `None`, no activation is applied
+      (i.e. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix,
+      used for the linear transformation of the inputs.
+    recurrent_initializer: Initializer for the `recurrent_kernel`
+       weights matrix,
+       used for the linear transformation of the recurrent state.
+    bias_initializer: Initializer for the bias vector.
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    recurrent_regularizer: Regularizer function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+      the output of the layer (its "activation").
+    kernel_constraint: Constraint function applied to
+      the `kernel` weights matrix.
+    recurrent_constraint: Constraint function applied to
+      the `recurrent_kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    dropout: Float between 0 and 1.
+      Fraction of the units to drop for the linear transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1.
+      Fraction of the units to drop for
+      the linear transformation of the recurrent state.
+    implementation: Implementation mode, either 1 or 2.
+      Mode 1 will structure its operations as a larger number of
+      smaller dot products and additions, whereas mode 2 will
+      batch them into fewer, larger operations. These modes will
+      have different performance profiles on different hardware and
+      for different applications.
+    return_sequences: Boolean. Whether to return the last output
+      in the output sequence, or the full sequence.
+    return_state: Boolean. Whether to return the last state
+      in addition to the output.
+    go_backwards: Boolean (default False).
+      If True, process the input sequence backwards and return the
+      reversed sequence.
+    stateful: Boolean (default False). If True, the last state
+      for each sample at index i in a batch will be used as initial
+      state for the sample of index i in the following batch.
+    unroll: Boolean (default False).
+      If True, the network will be unrolled,
+      else a symbolic loop will be used.
+      Unrolling can speed-up a RNN,
+      although it tends to be more memory-intensive.
+      Unrolling is only suitable for short sequences.
+    reset_after: GRU convention (whether to apply reset gate after or
+      before matrix multiplication). False = "before",
+      True = "after" (default and CuDNN compatible).
+
+  Call arguments:
+    inputs: A 3D tensor.
+    mask: Binary tensor of shape `(samples, timesteps)` indicating whether
+      a given timestep should be masked.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. This argument is passed to the cell
+      when calling it. This is only relevant if `dropout` or
+      `recurrent_dropout` is used.
+    initial_state: List of initial state tensors to be passed to the first
+      call of the cell.
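+
+  Example (an illustrative sketch, assuming the layer is exported as
+  `tf.keras.layers.GRU` when TF 2.x behavior is enabled):
+
+  ```
+  import numpy as np
+  import tensorflow as tf
+
+  inputs = np.random.random((32, 10, 8)).astype(np.float32)
+  gru = tf.keras.layers.GRU(4, return_sequences=True, return_state=True)
+  whole_sequence_output, final_state = gru(inputs)
+  # whole_sequence_output has shape (32, 10, 4); final_state has shape (32, 4).
+  ```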
+  """
+
+  def __init__(self,
+               units,
+               activation='tanh',
+               recurrent_activation='sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               dropout=0.,
+               recurrent_dropout=0.,
+               implementation=1,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               unroll=False,
+               time_major=False,
+               reset_after=True,
+               **kwargs):
+    # return_runtime is a flag for testing, which shows the real backend
+    # implementation chosen by grappler in graph mode.
+    self._return_runtime = kwargs.pop('return_runtime', False)
+
+    super(GRU, self).__init__(
+        units,
+        activation=activation,
+        recurrent_activation=recurrent_activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        recurrent_initializer=recurrent_initializer,
+        bias_initializer=bias_initializer,
+        kernel_regularizer=kernel_regularizer,
+        recurrent_regularizer=recurrent_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        kernel_constraint=kernel_constraint,
+        recurrent_constraint=recurrent_constraint,
+        bias_constraint=bias_constraint,
+        dropout=dropout,
+        recurrent_dropout=recurrent_dropout,
+        implementation=implementation,
+        return_sequences=return_sequences,
+        return_state=return_state,
+        go_backwards=go_backwards,
+        stateful=stateful,
+        unroll=unroll,
+        time_major=time_major,
+        reset_after=reset_after,
+        **kwargs)
+    # CuDNN uses the following settings by default; they are not configurable.
+    self.could_use_cudnn = (
+        activation == 'tanh' and recurrent_activation == 'sigmoid' and
+        recurrent_dropout == 0 and not unroll and use_bias and
+        reset_after)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    # GRU does not support constants. Ignore them during processing.
+    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
+
+    if isinstance(mask, list):
+      mask = mask[0]
+
+    input_shape = K.int_shape(inputs)
+    timesteps = input_shape[0] if self.time_major else input_shape[1]
+
+    if mask is not None or not self.could_use_cudnn:
+      # CuDNN does not support masking; fall back to the normal GRU.
+      kwargs = {'training': training}
+
+      def step(cell_inputs, cell_states):
+        return self.cell.call(cell_inputs, cell_states, **kwargs)
+
+      last_output, outputs, states = K.rnn(
+          step,
+          inputs,
+          initial_state,
+          constants=None,
+          go_backwards=self.go_backwards,
+          mask=mask,
+          unroll=self.unroll,
+          input_length=timesteps,
+          time_major=self.time_major,
+          zero_output_for_mask=self.zero_output_for_mask)
+      # This is a dummy tensor for testing purposes.
+      runtime = _runtime('unknown')
+    else:
+      last_output, outputs, runtime, states = self._defun_gru_call(
+          inputs, initial_state, training)
+
+    if self.stateful:
+      updates = [state_ops.assign(self.states[0], states[0])]
+      self.add_update(updates, inputs)
+
+    if self.return_sequences:
+      output = outputs
+    else:
+      output = last_output
+
+    if self.return_state:
+      return [output] + list(states)
+    elif self._return_runtime:
+      return output, runtime
+    else:
+      return output
+
+  def _defun_gru_call(self, inputs, initial_state, training):
+    # Use the new defun approach for backend implementation swap.
+    # Note that different implementations need to have the same function
+    # signature, e.g. the tensor parameters need to have the same shapes and
+    # dtypes.
+    if self.go_backwards:
+      # Reverse time axis.
+      inputs = K.reverse(inputs, 0 if self.time_major else 1)
+
+    self.reset_dropout_mask()
+    dropout_mask = self.get_dropout_mask_for_cell(inputs, training, count=3)
+    if dropout_mask is not None:
+      inputs *= dropout_mask[0]
+    if ops.executing_eagerly_outside_functions():
+      # Under eager context, the device placement is already known. Prefer the
+      # GPU implementation when GPU is available.
+      if context.num_gpus() > 0:
+        last_output, outputs, new_h, runtime = cudnn_gru(
+            inputs=inputs,
+            init_h=initial_state[0],
+            kernel=self.cell.kernel,
+            recurrent_kernel=self.cell.recurrent_kernel,
+            bias=self.cell.bias,
+            time_major=self.time_major)
+      else:
+        last_output, outputs, new_h, runtime = standard_gru(
+            inputs=inputs,
+            init_h=initial_state[0],
+            kernel=self.cell.kernel,
+            recurrent_kernel=self.cell.recurrent_kernel,
+            bias=self.cell.bias,
+            activation=self.activation,
+            recurrent_activation=self.recurrent_activation,
+            time_major=self.time_major)
+    else:
+      api_name = 'gru_' + str(uuid.uuid4())
+      defun_standard_gru = _generate_defun_backend(
+          api_name, _CPU_DEVICE_NAME, standard_gru)
+      defun_cudnn_gru = _generate_defun_backend(
+          api_name, _GPU_DEVICE_NAME, cudnn_gru)
+      # Call the normal GRU impl and register the CuDNN impl function. The
+      # grappler will kick in during session execution to optimize the graph.
+      last_output, outputs, new_h, runtime = defun_standard_gru(
+          inputs=inputs,
+          init_h=initial_state[0],
+          kernel=self.cell.kernel,
+          recurrent_kernel=self.cell.recurrent_kernel,
+          bias=self.cell.bias,
+          activation=self.activation,
+          recurrent_activation=self.recurrent_activation,
+          time_major=self.time_major)
+
+      function.register(defun_cudnn_gru, inputs, initial_state[0],
+                        self.cell.kernel, self.cell.recurrent_kernel,
+                        self.cell.bias, self.time_major)
+    states = [new_h]
+    return last_output, outputs, runtime, states
+
+
+def standard_gru(inputs, init_h, kernel, recurrent_kernel, bias, activation,
+                 recurrent_activation, time_major):
+  """GRU with standard kernel implementation.
+
+  This implementation can be run on all types of hardware.
+
+  This implementation lifts out all the layer weights and makes them function
+  parameters. It has the same number of tensor input parameters as the CuDNN
+  counterpart. The RNN step logic has been simplified, e.g. dropout and masking
+  are removed since the CuDNN implementation does not support them.
+
+  Arguments:
+    inputs: input tensor of GRU layer.
+    init_h: initial state tensor for the cell output.
+    kernel: weights for cell kernel.
+    recurrent_kernel: weights for cell recurrent kernel.
+    bias: weights for cell kernel bias and recurrent bias. The bias contains the
+      combined input_bias and recurrent_bias.
+    activation: Activation function to use for output.
+    recurrent_activation: Activation function to use for hidden recurrent state.
+    time_major: boolean, whether the inputs are in the format of
+      [time, batch, feature] or [batch, time, feature].
+
+  Returns:
+    last_output: output tensor for the last timestep, which has shape
+      [batch, units].
+    outputs: output tensor for all timesteps, which has shape
+      [batch, time, units].
+    state_0: the cell output, which has same shape as init_h.
+    runtime: constant string tensor which indicates the real runtime hardware.
+      This value is for testing purposes and should not be used by users.
+  """
+  input_shape = K.int_shape(inputs)
+  timesteps = input_shape[0] if time_major else input_shape[1]
+
+  input_bias, recurrent_bias = array_ops.unstack(bias)
+
+  def step(cell_inputs, cell_states):
+    """Step function that will be used by Keras RNN backend."""
+    h_tm1 = cell_states[0]
+
+    # inputs projected by all gate matrices at once
+    matrix_x = K.dot(cell_inputs, kernel)
+    matrix_x = K.bias_add(matrix_x, input_bias)
+
+    x_z, x_r, x_h = array_ops.split(matrix_x, 3, axis=1)
+
+    # hidden state projected by all gate matrices at once
+    matrix_inner = K.dot(h_tm1, recurrent_kernel)
+    matrix_inner = K.bias_add(matrix_inner, recurrent_bias)
+
+    recurrent_z, recurrent_r, recurrent_h = array_ops.split(matrix_inner, 3,
+                                                            axis=1)
+    z = recurrent_activation(x_z + recurrent_z)
+    r = recurrent_activation(x_r + recurrent_r)
+    hh = activation(x_h + r * recurrent_h)
+
+    # previous and candidate state mixed by update gate
+    h = z * h_tm1 + (1 - z) * hh
+    return h, [h]
+
+  last_output, outputs, new_states = K.rnn(
+      step,
+      inputs, [init_h],
+      constants=None,
+      unroll=False,
+      time_major=time_major,
+      input_length=timesteps)
+  return last_output, outputs, new_states[0], _runtime('cpu')
+
+
+def cudnn_gru(inputs, init_h, kernel, recurrent_kernel, bias, time_major):
+  """GRU with CuDNN implementation which is only available for GPU."""
+  if not time_major:
+    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
+  init_h = array_ops.expand_dims(init_h, axis=0)
+
+  weights = array_ops.split(kernel, 3, axis=1)
+  weights += array_ops.split(recurrent_kernel, 3, axis=1)
+  # Note that the bias was initialized with shape (2, 3 * units); flatten it
+  # into (6 * units).
+  bias = array_ops.split(K.flatten(bias), 6)
+  # Note that the gate order for CuDNN is different from the canonical format.
+  # The canonical format is [z, r, h], whereas CuDNN uses [r, z, h]. The swap
+  # needs to be done for kernel, recurrent_kernel, input_bias and
+  # recurrent_bias.
+  # z is the update gate weights.
+  # r is the reset gate weights.
+  # h is the candidate (new state) weights.
+  weights[0], weights[1] = weights[1], weights[0]
+  weights[3], weights[4] = weights[4], weights[3]
+  bias[0], bias[1] = bias[1], bias[0]
+  bias[3], bias[4] = bias[4], bias[3]
+
+  params = _canonical_to_params(
+      weights=weights,
+      biases=bias,
+      shape=constant_op.constant([-1]),
+      transpose_weights=True)
+
+  outputs, h, _, _ = gen_cudnn_rnn_ops.cudnn_rnn(
+      inputs,
+      input_h=init_h,
+      input_c=0,
+      params=params,
+      is_training=True,
+      rnn_mode='gru')
+  last_output = outputs[-1]
+  if not time_major:
+    outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
+  h = h[0]
+  return last_output, outputs, h, _runtime('cudnn')
+
+
+@keras_export('keras.layers.LSTM', v1=[])
+class LSTM(recurrent.DropoutRNNCellMixin, recurrent.LSTM):
+  """Long Short-Term Memory layer - Hochreiter 1997.
+
+  Based on available runtime hardware and constraints, this layer
+  will choose different implementations (cuDNN-based or pure-TensorFlow)
+  to maximize the performance. If a GPU is available and all
+  the arguments to the layer meet the requirements of the CuDNN kernel
+  (see below for details), the layer will use a fast cuDNN implementation.
+
+  The requirements to use the cuDNN implementation are:
+
+  1. `activation` == 'tanh'
+  2. `recurrent_activation` == 'sigmoid'
+  3. `recurrent_dropout` == 0
+  4. `unroll` is False
+  5. `use_bias` is True
+  6. No use of masking.
+
+  Arguments:
+    units: Positive integer, dimensionality of the output space.
+    activation: Activation function to use.
+      Default: hyperbolic tangent (`tanh`). If you pass `None`, no activation
+      is applied (i.e. "linear" activation: `a(x) = x`).
+    recurrent_activation: Activation function to use for the recurrent step.
+      Default: sigmoid (`sigmoid`). If you pass `None`, no activation is
+      applied (i.e. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix, used for
+      the linear transformation of the inputs.
+    recurrent_initializer: Initializer for the `recurrent_kernel` weights
+      matrix, used for the linear transformation of the recurrent state.
+    bias_initializer: Initializer for the bias vector.
+    unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at
+      initialization. Setting it to true will also force
+      `bias_initializer="zeros"`. This is recommended in [Jozefowicz et
+          al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf).
+    kernel_regularizer: Regularizer function applied to the `kernel` weights
+      matrix.
+    recurrent_regularizer: Regularizer function applied to the
+      `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to the output of the
+      layer (its "activation")..
+    kernel_constraint: Constraint function applied to the `kernel` weights
+      matrix.
+    recurrent_constraint: Constraint function applied to the `recurrent_kernel`
+      weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    dropout: Float between 0 and 1. Fraction of the units to drop for the linear
+      transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for
+      the linear transformation of the recurrent state.
+    implementation: Implementation mode, either 1 or 2. Mode 1 will structure
+      its operations as a larger number of smaller dot products and additions,
+      whereas mode 2 will batch them into fewer, larger operations. These modes
+      will have different performance profiles on different hardware and for
+      different applications.
+    return_sequences: Boolean. Whether to return the last output in the output
+      sequence, or the full sequence.
+    return_state: Boolean. Whether to return the last state in addition to the
+      output.
+    go_backwards: Boolean (default False). If True, process the input sequence
+      backwards and return the reversed sequence.
+    stateful: Boolean (default False). If True, the last state for each sample
+      at index i in a batch will be used as initial state for the sample of
+      index i in the following batch.
+    unroll: Boolean (default False). If True, the network will be unrolled, else
+      a symbolic loop will be used. Unrolling can speed up an RNN, although it
+      tends to be more memory-intensive. Unrolling is only suitable for short
+      sequences.
+
+  Call arguments:
+    inputs: A 3D tensor.
+    mask: Binary tensor of shape `(samples, timesteps)` indicating whether
+      a given timestep should be masked.
+    training: Python boolean indicating whether the layer should behave in
+      training mode or in inference mode. This argument is passed to the cell
+      when calling it. This is only relevant if `dropout` or
+      `recurrent_dropout` is used.
+    initial_state: List of initial state tensors to be passed to the first
+      call of the cell.
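+
+  Example (an illustrative sketch, assuming the layer is exported as
+  `tf.keras.layers.LSTM` when TF 2.x behavior is enabled):
+
+  ```
+  import numpy as np
+  import tensorflow as tf
+
+  inputs = np.random.random((32, 10, 8)).astype(np.float32)
+  lstm = tf.keras.layers.LSTM(4, return_sequences=True, return_state=True)
+  whole_sequence_output, final_h, final_c = lstm(inputs)
+  # whole_sequence_output: (32, 10, 4); final_h and final_c: (32, 4) each.
+  ```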
+  """
+
+  def __init__(self,
+               units,
+               activation='tanh',
+               recurrent_activation='sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               unit_forget_bias=True,
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               dropout=0.,
+               recurrent_dropout=0.,
+               implementation=1,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               time_major=False,
+               unroll=False,
+               **kwargs):
+    # return_runtime is a flag for testing, which shows the real backend
+    # implementation chosen by grappler in graph mode.
+    self.return_runtime = kwargs.pop('return_runtime', False)
+
+    super(LSTM, self).__init__(
+        units,
+        activation=activation,
+        recurrent_activation=recurrent_activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        recurrent_initializer=recurrent_initializer,
+        bias_initializer=bias_initializer,
+        unit_forget_bias=unit_forget_bias,
+        kernel_regularizer=kernel_regularizer,
+        recurrent_regularizer=recurrent_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        kernel_constraint=kernel_constraint,
+        recurrent_constraint=recurrent_constraint,
+        bias_constraint=bias_constraint,
+        dropout=dropout,
+        recurrent_dropout=recurrent_dropout,
+        implementation=implementation,
+        return_sequences=return_sequences,
+        return_state=return_state,
+        go_backwards=go_backwards,
+        stateful=stateful,
+        time_major=time_major,
+        unroll=unroll,
+        **kwargs)
+
+    self.state_spec = [
+        InputSpec(shape=(None, dim)) for dim in (self.units, self.units)
+    ]
+    self.could_use_cudnn = (
+        activation == 'tanh' and recurrent_activation == 'sigmoid' and
+        recurrent_dropout == 0 and not unroll and use_bias)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    # LSTM does not support constants. Ignore them during processing.
+    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
+
+    if isinstance(mask, list):
+      mask = mask[0]
+
+    input_shape = K.int_shape(inputs)
+    timesteps = input_shape[0] if self.time_major else input_shape[1]
+
+    if mask is not None or not self.could_use_cudnn:
+      # CuDNN does not support masking; fall back to the normal LSTM.
+      kwargs = {'training': training}
+
+      def step(inputs, states):
+        return self.cell.call(inputs, states, **kwargs)
+
+      last_output, outputs, states = K.rnn(
+          step,
+          inputs,
+          initial_state,
+          constants=None,
+          go_backwards=self.go_backwards,
+          mask=mask,
+          unroll=self.unroll,
+          input_length=timesteps,
+          time_major=self.time_major,
+          zero_output_for_mask=self.zero_output_for_mask)
+      runtime = _runtime('unknown')
+    else:
+      # Use the new defun approach for backend implementation swap.
+      # Note that different implementations need to have the same function
+      # signature, e.g. the tensor parameters need to have the same shapes and
+      # dtypes. Since CuDNN has an extra set of biases, those biases will be
+      # passed to both the normal and CuDNN implementations.
+      if self.go_backwards:
+        # Reverse time axis.
+        inputs = K.reverse(inputs, 0 if self.time_major else 1)
+
+      self.reset_dropout_mask()
+      dropout_mask = self.get_dropout_mask_for_cell(inputs, training, count=4)
+      if dropout_mask is not None:
+        inputs *= dropout_mask[0]
+
+      if ops.executing_eagerly_outside_functions():
+        # Under eager context, the device placement is already known. Prefer the
+        # GPU implementation here.
+        if context.num_gpus() > 0:
+          last_output, outputs, new_h, new_c, runtime = cudnn_lstm(
+              inputs, initial_state[0], initial_state[1], self.cell.kernel,
+              self.cell.recurrent_kernel, self.cell.bias, self.time_major)
+        else:
+          last_output, outputs, new_h, new_c, runtime = standard_lstm(
+              inputs, initial_state[0], initial_state[1], self.cell.kernel,
+              self.cell.recurrent_kernel, self.cell.bias, self.activation,
+              self.recurrent_activation, self.time_major)
+      else:
+        # Each time a `tf.function` is called, we will give it a unique
+        # identifiable API name, so that Grappler won't get confused when it
+        # sees multiple LSTM layers added into same graph, and it will be able
+        # to pair up the different implementations across them.
+        api_name = 'lstm_' + str(uuid.uuid4())
+        defun_standard_lstm = _generate_defun_backend(
+            api_name, _CPU_DEVICE_NAME, standard_lstm)
+        defun_cudnn_lstm = _generate_defun_backend(
+            api_name, _GPU_DEVICE_NAME, cudnn_lstm)
+
+        # Call the normal LSTM impl and register the CuDNN impl function. The
+        # grappler will kick in during session execution to optimize the graph.
+        last_output, outputs, new_h, new_c, runtime = defun_standard_lstm(
+            inputs, initial_state[0], initial_state[1], self.cell.kernel,
+            self.cell.recurrent_kernel, self.cell.bias, self.activation,
+            self.recurrent_activation, self.time_major)
+
+        function.register(defun_cudnn_lstm, inputs, initial_state[0],
+                          initial_state[1], self.cell.kernel,
+                          self.cell.recurrent_kernel, self.cell.bias,
+                          self.time_major)
+      states = [new_h, new_c]
+
+    if self.stateful:
+      updates = []
+      for i in range(len(states)):
+        updates.append(state_ops.assign(self.states[i], states[i]))
+      self.add_update(updates, inputs)
+
+    if self.return_sequences:
+      output = outputs
+    else:
+      output = last_output
+
+    if self.return_state:
+      return [output] + list(states)
+    elif self.return_runtime:
+      return output, runtime
+    else:
+      return output
+
+
+def _canonical_to_params(weights, biases, shape, transpose_weights=False):
+  """Utility function convert variable to CuDNN compatible parameter.
+
+  Note that Keras weights for kernels are different from the CuDNN format, e.g.:
+
+  ```
+    Keras                 CuDNN
+    [[0, 1, 2],  <--->  [[0, 2, 4],
+     [3, 4, 5]]          [1, 3, 5]]
+  ```
+
+  If the input weights need to be in a unified format, then set
+  `transpose_weights=True` to convert the weights.
+
+  Args:
+    weights: list of weights for the individual kernels and recurrent kernels.
+    biases: list of biases for the individual gates.
+    shape: the shape for the converted variables that will be fed to CuDNN.
+    transpose_weights: boolean, whether to transpose the weights.
+
+  Returns:
+    The converted weights that can be fed to CuDNN ops as params.
+  """
+  def convert(w):
+    return array_ops.transpose(w) if transpose_weights else w
+
+  weights = [array_ops.reshape(convert(x), shape) for x in weights]
+  biases = [array_ops.reshape(x, shape) for x in biases]
+  return array_ops.concat(weights + biases, axis=0)
+
+
+def standard_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias,
+                  activation, recurrent_activation, time_major):
+  """LSTM with standard kernel implementation.
+
+  This implementation can be run on all types of hardware.
+
+  This implementation lifts out all the layer weights and makes them function
+  parameters. It has the same number of tensor input parameters as the CuDNN
+  counterpart. The RNN step logic has been simplified, e.g. dropout and masking
+  are removed since the CuDNN implementation does not support them.
+
+  Note that the first half of the bias tensor should be ignored by this impl.
+  The CuDNN impl needs an extra set of input gate biases. In order to make both
+  functions take parameters of the same shape, that extra set of biases is also
+  fed here.
+
+  Args:
+    inputs: input tensor of LSTM layer.
+    init_h: initial state tensor for the cell output.
+    init_c: initial state tensor for the cell hidden state.
+    kernel: weights for cell kernel.
+    recurrent_kernel: weights for cell recurrent kernel.
+    bias: weights for cell kernel bias and recurrent bias. Only recurrent bias
+      is used in this case.
+    activation: Activation function to use for output.
+    recurrent_activation: Activation function to use for hidden recurrent state.
+    time_major: boolean, whether the inputs are in the format of
+      [time, batch, feature] or [batch, time, feature].
+
+  Returns:
+    last_output: output tensor for the last timestep, which has shape
+      [batch, units].
+    outputs: output tensor for all timesteps, which has shape
+      [batch, time, units].
+    state_0: the cell output, which has same shape as init_h.
+    state_1: the cell hidden state, which has same shape as init_c.
+    runtime: constant string tensor which indicates the real runtime hardware.
+      This value is for testing purposes and should not be used by users.
+  """
+  input_shape = K.int_shape(inputs)
+  timesteps = input_shape[0] if time_major else input_shape[1]
+
+  def step(cell_inputs, cell_states):
+    """Step function that will be used by Keras RNN backend."""
+    h_tm1 = cell_states[0]  # previous memory state
+    c_tm1 = cell_states[1]  # previous carry state
+
+    z = K.dot(cell_inputs, kernel)
+    z += K.dot(h_tm1, recurrent_kernel)
+    z = K.bias_add(z, bias)
+
+    z0, z1, z2, z3 = array_ops.split(z, 4, axis=1)
+
+    i = recurrent_activation(z0)
+    f = recurrent_activation(z1)
+    c = f * c_tm1 + i * activation(z2)
+    o = recurrent_activation(z3)
+
+    h = o * activation(c)
+    return h, [h, c]
+
+  last_output, outputs, new_states = K.rnn(
+      step,
+      inputs, [init_h, init_c],
+      constants=None,
+      unroll=False,
+      time_major=time_major,
+      input_length=timesteps)
+  return last_output, outputs, new_states[0], new_states[1], _runtime('cpu')
+
+
+def cudnn_lstm(inputs, input_h, input_c, kernel, recurrent_kernel, bias,
+               time_major):
+  """LSTM with CuDNN implementation which is only available for GPU."""
+  if not time_major:
+    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
+  input_h = array_ops.expand_dims(input_h, axis=0)
+  input_c = array_ops.expand_dims(input_c, axis=0)
+
+  weights = array_ops.split(kernel, 4, axis=1)
+  weights += array_ops.split(recurrent_kernel, 4, axis=1)
+  # CuDNN has an extra set of biases for the inputs; we disable them (set them
+  # to 0) so that mathematically it is the same as the canonical LSTM
+  # implementation.
+  full_bias = array_ops.concat((array_ops.zeros_like(bias), bias), 0)
+
+  params = _canonical_to_params(
+      weights=weights,
+      biases=array_ops.split(full_bias, 8),
+      shape=constant_op.constant([-1]),
+      transpose_weights=True)
+
+  outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
+      inputs, input_h=input_h, input_c=input_c, params=params, is_training=True)
+  last_output = outputs[-1]
+  if not time_major:
+    outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
+  h = h[0]
+  c = c[0]
+
+  return last_output, outputs, h, c, _runtime('cudnn')
+
+
+def _generate_defun_backend(unique_api_name, preferred_device, func):
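+  # Wraps `func` in a defun annotated with a shared API name and a preferred
+  # device. Grappler uses these attributes to pair the CPU and CuDNN variants
+  # registered under the same api_name and to swap in the one that matches the
+  # device the function ends up placed on (see the callers above).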
+  function_attributes = {
+      _DEFUN_API_NAME_ATTRIBUTE: unique_api_name,
+      _DEFUN_DEVICE_ATTRIBUTE: preferred_device,
+  }
+  return function.defun_with_attributes(func=func,
+                                        attributes=function_attributes)
+
+
+def _runtime(runtime_name):
+  with ops.device('/cpu:0'):
+    return constant_op.constant(
+        runtime_name, dtype=dtypes.string, name='runtime')
diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py
index aa3b57b..a651f7f 100644
--- a/tensorflow/python/keras/layers/serialization.py
+++ b/tensorflow/python/keras/layers/serialization.py
@@ -16,6 +16,7 @@
 """
 # pylint: disable=wildcard-import
 # pylint: disable=unused-import
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -40,21 +41,14 @@
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.util.tf_export import keras_export
 
+if tf2.enabled():
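+  # Pull in the recurrent_v2 symbols so that they shadow the v1 LSTM/GRU
+  # classes in this module's namespace and deserialization resolves to the
+  # v2 implementations under TF 2.x behavior.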
+  from tensorflow.python.keras.layers.recurrent_v2 import *     # pylint: disable=g-import-not-at-top
+
 # TODO(b/124791387): replace mapping with layer attribute.
 # Name conversion between class name and API symbol in config.
 _SERIALIZATION_TABLE = {
     'BatchNormalizationV1': 'BatchNormalization',
     'BatchNormalizationV2': 'BatchNormalization',
-    'UnifiedLSTM': 'LSTM',
-    'UnifiedGRU': 'GRU',
-}
-
-# Name conversion between API symbol in config and class name.
-# Note that the class names is a list where the first item is v1 class name and
-# the second item is the v2 class name.
-_DESERIALIZATION_TABLE = {
-    'LSTM': {'v1': 'LSTM', 'v2': 'UnifiedLSTM'},
-    'GRU': {'v1': 'GRU', 'v2': 'UnifiedGRU'},
 }
 
 
@@ -83,10 +77,6 @@
   globs['Network'] = models.Network
   globs['Model'] = models.Model
   globs['Sequential'] = models.Sequential
-  layer_class_name = config['class_name']
-  if layer_class_name in _DESERIALIZATION_TABLE:
-    version = 'v2' if tf2.enabled() else 'v1'
-    config['class_name'] = _DESERIALIZATION_TABLE[layer_class_name][version]
 
   return deserialize_keras_object(
       config,
diff --git a/tensorflow/python/keras/layers/serialization_test.py b/tensorflow/python/keras/layers/serialization_test.py
index eaaa447..ab86529 100644
--- a/tensorflow/python/keras/layers/serialization_test.py
+++ b/tensorflow/python/keras/layers/serialization_test.py
@@ -23,6 +23,8 @@
 from tensorflow.python import keras
 from tensorflow.python import tf2
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras.layers import recurrent as rnn_v1
+from tensorflow.python.keras.layers import recurrent_v2 as rnn_v2
 from tensorflow.python.platform import test
 
 
@@ -61,7 +63,7 @@
     self.assertEqual(new_layer.gamma_regularizer.__class__,
                      keras.regularizers.L1L2)
 
-  @parameterized.parameters([keras.layers.LSTM, keras.layers.UnifiedLSTM])
+  @parameterized.parameters([rnn_v1.LSTM, rnn_v2.LSTM])
   def test_serialize_deserialize_lstm(self, layer):
     lstm = layer(5, return_sequences=True)
     config = keras.layers.serialize(lstm)
@@ -70,11 +72,12 @@
     self.assertEqual(new_layer.units, 5)
     self.assertEqual(new_layer.return_sequences, True)
     if tf2.enabled():
-      self.assertIsInstance(new_layer, keras.layers.UnifiedLSTM)
+      self.assertIsInstance(new_layer, rnn_v2.LSTM)
     else:
-      self.assertIsInstance(new_layer, keras.layers.LSTM)
+      self.assertIsInstance(new_layer, rnn_v1.LSTM)
+      self.assertNotIsInstance(new_layer, rnn_v2.LSTM)
 
-  @parameterized.parameters([keras.layers.GRU, keras.layers.UnifiedGRU])
+  @parameterized.parameters([rnn_v1.GRU, rnn_v2.GRU])
   def test_serialize_deserialize_gru(self, layer):
     gru = layer(5, return_sequences=True)
     config = keras.layers.serialize(gru)
@@ -83,9 +86,10 @@
     self.assertEqual(new_layer.units, 5)
     self.assertEqual(new_layer.return_sequences, True)
     if tf2.enabled():
-      self.assertIsInstance(new_layer, keras.layers.UnifiedGRU)
+      self.assertIsInstance(new_layer, rnn_v2.GRU)
     else:
-      self.assertIsInstance(new_layer, keras.layers.GRU)
+      self.assertIsInstance(new_layer, rnn_v1.GRU)
+      self.assertNotIsInstance(new_layer, rnn_v2.GRU)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/tensorflow_op_layer_test.py b/tensorflow/python/keras/layers/tensorflow_op_layer_test.py
index 993f5a9..d0c5f25 100644
--- a/tensorflow/python/keras/layers/tensorflow_op_layer_test.py
+++ b/tensorflow/python/keras/layers/tensorflow_op_layer_test.py
@@ -221,7 +221,7 @@
     size_500 = _construct_graph_of_size(500)
 
     # Check construction time grows approx. linearly with size.
-    e = 2  # Fudge factor to prevent flakiness.
+    e = 3  # Fudge factor to prevent flakiness.
     self.assertLess(size_500, (10 * e) * size_50)
 
   def test_no_mask_tracking(self):
diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py
index 0121410..e4371c2 100644
--- a/tensorflow/python/keras/models.py
+++ b/tensorflow/python/keras/models.py
@@ -88,7 +88,6 @@
   tensor_map = {}  # Map {reference_tensor: corresponding_tensor}
   if input_tensors is None:
     # Create placeholders to build the model on top of.
-    input_layers = []
     input_tensors = []
     for layer in model._input_layers:
       input_tensor = Input(
@@ -100,10 +99,6 @@
       # Cache newly created input layer.
       newly_created_input_layer = input_tensor._keras_history[0]
       layer_map[layer] = newly_created_input_layer
-
-    for original_input_layer, cloned_input_layer in zip(model._input_layers,
-                                                        input_layers):
-      layer_map[original_input_layer] = cloned_input_layer
   else:
     # Make sure that all input tensors come from a Keras layer.
     # If tensor comes from an input layer: cache the input layer.
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
index bebc603..3b6d30a 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
@@ -36,6 +36,7 @@
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gradients
@@ -441,11 +442,18 @@
           update_ops.extend(
               distribution.extended.update(
                   var, apply_grad_to_update_var, args=(grad,), group=False))
-      with ops.control_dependencies(update_ops):
-        apply_updates = self._iterations.assign_add(1)
-      if not context.executing_eagerly():
-        apply_updates = apply_updates.op
-      return apply_updates
+
+      any_symbolic = any(isinstance(i, ops.Operation) or
+                         tf_utils.is_symbolic_tensor(i) for i in update_ops)
+      if not context.executing_eagerly() or any_symbolic:
+        # If the current context is graph mode or any of the update ops are
+        # symbolic, then the step update should be carried out under a graph
+        # context (eager updates execute immediately).
+        with ops._get_graph_from_inputs(update_ops).as_default():  # pylint: disable=protected-access
+          with ops.control_dependencies(update_ops):
+            return self._iterations.assign_add(1).op
+
+      return self._iterations.assign_add(1)
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
diff --git a/tensorflow/python/keras/saving/saving_utils.py b/tensorflow/python/keras/saving/saving_utils.py
index 8c0fd2f..55ddde5 100644
--- a/tensorflow/python/keras/saving/saving_utils.py
+++ b/tensorflow/python/keras/saving/saving_utils.py
@@ -72,11 +72,15 @@
           'set. Usually, input shapes are automatically determined from calling'
           ' .fit() or .predict(). To manually set the shapes, call '
           'model._set_inputs(inputs).'.format(model))
-    input_specs = []
-    for input_tensor, input_name in zip(inputs, input_names):
-      input_specs.append(tensor_spec.TensorSpec(
+    flat_inputs = nest.flatten(inputs)
+    flat_input_names = nest.flatten(input_names)
+    flat_input_specs = []
+    for input_tensor, input_name in zip(flat_inputs, flat_input_names):
+      flat_input_specs.append(tensor_spec.TensorSpec(
           shape=input_tensor.shape, dtype=input_tensor.dtype,
           name=input_name))
+    input_specs = nest.pack_sequence_as(structure=inputs,
+                                        flat_sequence=flat_input_specs)
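+    # For example (hypothetical): if `inputs` is {'a': t_a, 'b': t_b}, the
+    # flattened specs are [spec_a, spec_b] and pack_sequence_as rebuilds
+    # {'a': spec_a, 'b': spec_b}, preserving the original nesting.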
     # The input signature of the call function is a list with one element, since
     # all tensor inputs must be passed in as the first argument.
     input_signature = [input_specs] if len(input_specs) > 1 else input_specs
diff --git a/tensorflow/python/keras/utils/vis_utils.py b/tensorflow/python/keras/utils/vis_utils.py
index d396851..b34bed5 100644
--- a/tensorflow/python/keras/utils/vis_utils.py
+++ b/tensorflow/python/keras/utils/vis_utils.py
@@ -20,6 +20,7 @@
 from __future__ import print_function
 
 import os
+import sys
 from tensorflow.python.util.tf_export import keras_export
 
 
@@ -43,11 +44,11 @@
     # Attempt to create an image of a blank graph
     # to check the pydot/graphviz installation.
     pydot.Dot.create(pydot.Dot())
-  except Exception:
+    return True
+  except Exception:  # pylint: disable=broad-except
     # pydot raises a generic Exception here,
     # so no specific class can be caught.
-    raise ImportError('Failed to import pydot. You must install pydot'
-                      ' and graphviz for `pydotprint` to work.')
+    return False
 
 
 def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
@@ -63,13 +64,28 @@
           'LR' creates a horizontal plot.
 
   Returns:
-      A `pydot.Dot` instance representing the Keras model.
+      A `pydot.Dot` instance representing the Keras model (or None if the Dot
+      file could not be generated).
+
+  Raises:
+    ImportError: if graphviz or pydot are not available.
   """
   from tensorflow.python.keras.layers.wrappers import Wrapper
   from tensorflow.python.keras.models import Sequential
   from tensorflow.python.util import nest
 
-  _check_pydot()
+  check = _check_pydot()
+  if not check:
+    if 'IPython.core.magics.namespace' in sys.modules:
+      # We don't raise an exception here in order to avoid crashing notebook
+      # tests where graphviz is not available.
+      print('Failed to import pydot. You must install pydot'
+            ' and graphviz for `pydotprint` to work.')
+      return
+    else:
+      raise ImportError('Failed to import pydot. You must install pydot'
+                        ' and graphviz for `pydotprint` to work.')
+
   dot = pydot.Dot()
   dot.set('rankdir', rankdir)
   dot.set('concentrate', True)
@@ -151,6 +167,8 @@
       This enables in-line display of the model plots in notebooks.
   """
   dot = model_to_dot(model, show_shapes, show_layer_names, rankdir)
+  if dot is None:
+    return
   _, extension = os.path.splitext(to_file)
   if not extension:
     extension = 'png'
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 3bd7e2f..cec51dd 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -829,6 +829,8 @@
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:variables",
     ],
+    # TODO(b/128347673): Re-enable.
+    tags = ["no_windows"],
     xla_enable_strict_auto_jit = True,
 )
 
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index e87ccca..aa207eb 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -18,6 +18,7 @@
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import os
 import numpy as np
 
@@ -839,12 +840,18 @@
         1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0,
         0.0, 1.0, 0.0, 1.0
     ]
-    configs = [[False, False, [0, 1, 3, 5, 0, 2, 6, 8]],
-               [False, True, [0, 1, 3, 5, 9, 11, 15, 17]],
-               [True, False, [0, 1, 3, 5, 0, 2, 6, 8]]]
 
-    for use_gpu, include_batch_in_index, argmax_exp in configs:
-      with GetDeviceScope(self, use_gpu=use_gpu):
+    Config = collections.namedtuple(
+        "Config", ["use_gpu", "include_batch_in_index", "argmax"])
+    configs = [
+        Config(False, False, [0, 1, 3, 5, 0, 2, 6, 8]),
+        Config(False, True, [0, 1, 3, 5, 9, 11, 15, 17]),
+        Config(True, False, [0, 1, 3, 5, 0, 2, 6, 8]),
+        Config(True, True, [0, 1, 3, 5, 9, 11, 15, 17])
+    ]
+
+    for config in configs:
+      with GetDeviceScope(self, use_gpu=config.use_gpu):
         t = constant_op.constant(tensor_input, shape=[2, 3, 3, 1])
         out_op, argmax_op = nn_ops.max_pool_with_argmax(
             t,
@@ -852,13 +859,13 @@
             strides=[1, 1, 1, 1],
             Targmax=dtypes.int64,
             padding="VALID",
-            include_batch_in_index=include_batch_in_index)
+            include_batch_in_index=config.include_batch_in_index)
         out, argmax = self.evaluate([out_op, argmax_op])
         self.assertShapeEqual(out, out_op)
         self.assertShapeEqual(argmax, argmax_op)
         self.assertAllClose(out.ravel(),
                             [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
-        self.assertAllEqual(argmax.ravel(), argmax_exp)
+        self.assertAllEqual(argmax.ravel(), config.argmax)
 
   def testMaxPoolingGradWithArgmax(self):
     orig_input = [
@@ -867,16 +874,21 @@
     ]
     tensor_input = [11.0, 12.0, 13.0, 14.0, 21.0, 22.0, 23.0, 24.0]
 
-    configs = [[False, False, [0, 1, 3, 5, 0, 2, 6, 8]],
-               [False, True, [0, 1, 3, 5, 9, 11, 15, 17]],
-               [True, False, [0, 1, 3, 5, 0, 2, 6, 8]]]
+    Config = collections.namedtuple(
+        "Config", ["use_gpu", "include_batch_in_index", "argmax"])
+    configs = [
+        Config(False, False, [0, 1, 3, 5, 0, 2, 6, 8]),
+        Config(False, True, [0, 1, 3, 5, 9, 11, 15, 17]),
+        Config(True, False, [0, 1, 3, 5, 0, 2, 6, 8]),
+        Config(True, True, [0, 1, 3, 5, 9, 11, 15, 17])
+    ]
 
-    for use_gpu, include_batch_in_index, argmax in configs:
-      with GetDeviceScope(self, use_gpu):
+    for config in configs:
+      with GetDeviceScope(self, config.use_gpu):
         orig_in = constant_op.constant(orig_input, shape=[2, 3, 3, 1])
         t = constant_op.constant(tensor_input, shape=[2, 2, 2, 1])
         argmax_t = constant_op.constant(
-            argmax, shape=[2, 2, 2, 1], dtype=dtypes.int64)
+            config.argmax, shape=[2, 2, 2, 1], dtype=dtypes.int64)
         out_op = gen_nn_ops.max_pool_grad_with_argmax(
             orig_in,
             t,
@@ -884,7 +896,7 @@
             ksize=[1, 2, 2, 1],
             strides=[1, 1, 1, 1],
             padding="VALID",
-            include_batch_in_index=include_batch_in_index)
+            include_batch_in_index=config.include_batch_in_index)
         out = self.evaluate(out_op).flatten()
         self.assertAllClose(out, [
             11.0, 12.0, 0.0, 13.0, 0.0, 14.0, 0.0, 0.0, 0.0, 21.0, 0.0, 22.0,
@@ -903,22 +915,31 @@
         11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 21.0, 22.0, 23.0,
         24.0, 25.0, 26.0, 27.0, 28.0, 29.0
     ]
-    tensor_argmax = list(np.array([0, 1, 3, 5, 0, 2, 6, 8], dtype=np.int64))
-    with self.session(use_gpu=True):
-      orig_in = constant_op.constant(orig_input, shape=[2, 3, 3, 1])
-      t = constant_op.constant(tensor_input, shape=[2, 3, 3, 1])
-      argmax = constant_op.constant(
-          tensor_argmax, shape=[2, 2, 2, 1], dtype=dtypes.int64)
-      out_op = gen_nn_ops.max_pool_grad_grad_with_argmax(
-          orig_in,
-          t,
-          argmax,
-          ksize=[1, 2, 2, 1],
-          strides=[1, 1, 1, 1],
-          padding="VALID",
-          include_batch_in_index=False)
-      out = self.evaluate(out_op).flatten()
-      self.assertAllClose(out, [11.0, 12.0, 14.0, 16.0, 21.0, 23.0, 27.0, 29.0])
+
+    Config = collections.namedtuple(
+        "Config", ["use_gpu", "include_batch_in_index", "argmax"])
+    configs = [
+        Config(True, False, [0, 1, 3, 5, 0, 2, 6, 8]),
+        Config(True, True, [0, 1, 3, 5, 9, 11, 15, 17])
+    ]
+
+    for config in configs:
+      with GetDeviceScope(self, config.use_gpu):
+        orig_in = constant_op.constant(orig_input, shape=[2, 3, 3, 1])
+        t = constant_op.constant(tensor_input, shape=[2, 3, 3, 1])
+        argmax_t = constant_op.constant(
+            config.argmax, shape=[2, 2, 2, 1], dtype=dtypes.int64)
+        out_op = gen_nn_ops.max_pool_grad_grad_with_argmax(
+            orig_in,
+            t,
+            argmax_t,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 1, 1, 1],
+            padding="VALID",
+            include_batch_in_index=config.include_batch_in_index)
+        out = self.evaluate(out_op).flatten()
+        self.assertAllClose(out,
+                            [11.0, 12.0, 14.0, 16.0, 21.0, 23.0, 27.0, 29.0])
 
   def _ConstructAndTestGradient(self,
                                 pool_func,
diff --git a/tensorflow/python/kernel_tests/signal/fft_ops_test.py b/tensorflow/python/kernel_tests/signal/fft_ops_test.py
index 9b66f3e..c3a2887 100644
--- a/tensorflow/python/kernel_tests/signal/fft_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/fft_ops_test.py
@@ -468,7 +468,6 @@
                 gen_complex(complex_dims), rank, (size,) * rank)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("b/123738986")  # More assertions needed.
   def testError(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
diff --git a/tensorflow/python/ops/nn_batchnorm_test.py b/tensorflow/python/ops/nn_batchnorm_test.py
index fedf8e4..e978f1d 100644
--- a/tensorflow/python/ops/nn_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_batchnorm_test.py
@@ -206,7 +206,6 @@
                                   2)
 
   @test_util.run_deprecated_v1
-  @test_util.disable_xla("This test never passed for XLA")
   def testBatchNormGradImpl(self):
     x_shape = [7, 5, 4, 6]
     param_shape = [6]
diff --git a/tensorflow/python/ops/raw_ops_test.py b/tensorflow/python/ops/raw_ops_test.py
index 6e47802..a504679 100644
--- a/tensorflow/python/ops/raw_ops_test.py
+++ b/tensorflow/python/ops/raw_ops_test.py
@@ -29,14 +29,18 @@
 class RawOpsTest(test.TestCase):
 
   def testSimple(self):
-
-    with self.assertRaisesRegexp(TypeError, "only takes keyword args"):
-      _ = gen_math_ops.Add(1., 1.)
-
     x = constant_op.constant(1)
     self.assertEqual([2], self.evaluate(gen_math_ops.Add(x=x, y=x)))
 
+  def testRequiresKwargs(self):
+    with self.assertRaisesRegexp(TypeError, "only takes keyword args"):
+      gen_math_ops.Add(1., 1.)
 
-if __name__ == '__main__':
+  def testRequiresKwargs_providesSuggestion(self):
+    msg = "possible keys: \\['x', 'y'\\]"
+    with self.assertRaisesRegexp(TypeError, msg):
+      gen_math_ops.Add(1., y=2.)
+
+if __name__ == "__main__":
   ops.enable_eager_execution()
   test.main()
diff --git a/tensorflow/python/ops/stateful_random_ops.py b/tensorflow/python/ops/stateful_random_ops.py
index 9f9da56..6198aa6 100644
--- a/tensorflow/python/ops/stateful_random_ops.py
+++ b/tensorflow/python/ops/stateful_random_ops.py
@@ -61,17 +61,21 @@
 THREEFRY_STATE_SIZE = 2
 
 
-def non_deterministic_seed():
-  """Makes a non-deterministic seed.
+def non_deterministic_ints(shape, dtype=dtypes.int64):
+  """Non-deterministically generates some integers.
 
-  The implementation will be changed soon from pure Python to an op.
+  This op may use some OS-provided source of non-determinism (e.g. an RNG), so
+  each execution will give different results.
+
+  Args:
+    shape: the shape of the result.
+    dtype: (optional) the dtype of the result.
 
   Returns:
-    a 1-D tensor.
+    a tensor whose element values are non-deterministically chosen.
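+
+  For example (an illustrative sketch; actual values differ on every call):
+
+  ```
+  ints = non_deterministic_ints(shape=[2, 3], dtype=dtypes.int64)
+  # ints is a [2, 3] int64 tensor; calling again gives different values.
+  ```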
   """
-  return np.random.randint(
-      low=SEED_MIN, high=SEED_MAX + 1, size=SEED_SIZE,
-      dtype=SEED_TYPE)
+  return gen_stateful_random_ops.non_deterministic_ints(
+      shape=shape, dtype=dtype)
 
 
 def _uint_to_int(n):
@@ -116,37 +120,17 @@
   return seed
 
 
-def _make_philox_state(seed):
-  """Makes a RNG state for Philox algorithm.
-
-  Args:
-    seed: an integer or 1-D tensor.
-
-  Returns:
-    a 1-D tensor.
-  """
-  return _make_1d_state(PHILOX_STATE_SIZE, seed)
-
-
-def _make_threefry_state(seed):
-  """Makes a RNG state for ThreeFry algorithm.
-
-  Args:
-    seed: an integer or 1-D tensor.
-
-  Returns:
-    a 1-D tensor.
-  """
-  return _make_1d_state(THREEFRY_STATE_SIZE, seed)
-
-
-def _make_state_from_seed(seed, algorithm):
-  if algorithm == RNG_ALG_PHILOX:
-    return _make_philox_state(seed)
-  elif algorithm == RNG_ALG_THREEFRY:
-    return _make_threefry_state(seed)
+def _get_state_size(alg):
+  if alg == RNG_ALG_PHILOX:
+    return PHILOX_STATE_SIZE
+  elif alg == RNG_ALG_THREEFRY:
+    return THREEFRY_STATE_SIZE
   else:
-    raise ValueError("Unsupported algorithm id: %s" % algorithm)
+    raise ValueError("Unsupported algorithm id: %s" % alg)
+
+
+def _make_state_from_seed(seed, alg):
+  return _make_1d_state(_get_state_size(alg), seed)
 
 
 @tf_export("random.create_rng_state")
@@ -190,12 +174,14 @@
                  auto-selected.
     """
     if copy_from is None:
-      if seed is None:
-        seed = non_deterministic_seed()
       if algorithm is None:
         # TODO(wangpeng): more sophisticated algorithm selection
         algorithm = DEFAULT_ALGORITHM
-      state = create_rng_state(seed, algorithm)
+      if seed is None:
+        state = non_deterministic_ints(shape=[_get_state_size(algorithm)],
+                                       dtype=SEED_TYPE)
+      else:
+        state = create_rng_state(seed, algorithm)
       self._state_var = variables.Variable(state, dtype=STATE_TYPE)
       self._alg_var = algorithm
     else:
@@ -220,6 +206,10 @@
   def algorithm(self):
     return self._alg_var
 
+  def _standard_normal(self, shape, dtype):
+    return gen_stateful_random_ops.stateful_standard_normal_v2(
+        self.state.handle, self.algorithm, shape, dtype=dtype)
+
   # The following functions return a tensor and as a side effect update
   # self._state_var.
   def normal(self, shape, mean=0.0, stddev=1.0, dtype=dtypes.float32,
@@ -228,8 +218,7 @@
       shape = _shape_tensor(shape)
       mean = ops.convert_to_tensor(mean, dtype=dtype, name="mean")
       stddev = ops.convert_to_tensor(stddev, dtype=dtype, name="stddev")
-      rnd = gen_stateful_random_ops.stateful_standard_normal_v2(
-          self.state.handle, self.algorithm, shape, dtype=dtype)
+      rnd = self._standard_normal(shape, dtype=dtype)
       return math_ops.add(rnd * stddev, mean, name=name)
 
   def uniform(self, shape, minval=0, maxval=None,
diff --git a/tensorflow/python/ops/stateful_random_ops_test.py b/tensorflow/python/ops/stateful_random_ops_test.py
index 4f3a962..ea1cebd 100644
--- a/tensorflow/python/ops/stateful_random_ops_test.py
+++ b/tensorflow/python/ops/stateful_random_ops_test.py
@@ -35,6 +35,10 @@
 from tensorflow.python.platform import test
 
 
+g_seeded = None
+g_unseeded = None
+
+
 class StatefulRandomOpsTest(test.TestCase):
 
   def testCreateRNGStateIntSeed(self):
@@ -49,6 +53,75 @@
         state)
 
   @test_util.run_v2_only
+  def testNonDeterministicInts(self):
+    """Tests that non_deterministic_ints returns different results every time.
+
+    This test is flaky, but with very low probability of failing.
+    """
+    shape = [2, 3]
+    dtype = dtypes.uint64
+    a = random.non_deterministic_ints(shape=shape, dtype=dtype)
+    self.assertAllEqual(shape, a.shape)
+    self.assertEqual(dtype, a.dtype)
+    b = random.non_deterministic_ints(shape, dtype=dtype)
+    self.assertNotAllClose(a, b)
+
+  @test_util.run_v2_only
+  def testGeneratorCreationInDefun(self):
+    """Tests creating a Generator in defun.
+
+    The interaction between Generator creation and defun should be the same as
+    tf.Variable.
+    """
+    seed = 1234
+    shape = [2, 3]
+    with ops.device("/device:CPU:0"):
+      gen = random.Generator(seed=seed)
+      expected_normal1 = gen.normal(shape)
+      expected_normal2 = gen.normal(shape)
+      @def_function.function
+      def f():
+        global g_seeded
+        global g_unseeded
+        # defun'ed function should only create variables once
+        if g_seeded is None:
+          g_seeded = random.Generator(seed=seed)
+        if g_unseeded is None:
+          g_unseeded = random.Generator()
+        r = g_seeded.normal(shape)
+        r = (r, g_unseeded.normal(shape))
+        return r
+      def check_results(expected_normal, v1, v2):
+        self.assertAllEqual(expected_normal, v1)
+        self.assertAllEqual(shape, v2.shape)
+      check_results(expected_normal1, *f())
+      check_results(expected_normal2, *f())
+
+  @test_util.run_v1_only
+  def testTF1(self):
+    seed = 1234
+    shape = [2, 3]
+    expected_normal1 = constant_op.constant(
+        [[0.9356609, 1.0854305, -0.93788373],
+         [-0.50615472, 1.31697023, 0.71375787]], dtype=dtypes.float32)
+    expected_normal2 = constant_op.constant(
+        [[-0.3964749, 0.8369565, -0.30946946],
+         [1.1206646, 1.00852597, -0.10185789]], dtype=dtypes.float32)
+    with self.cached_session() as sess:
+      gen1 = random.Generator(seed=seed)
+      gen2 = random.Generator()
+      sess.run((gen1._state_var.initializer, gen2._state_var.initializer))
+      r1 = gen1.normal(shape)
+      r2 = gen2.normal(shape)
+      def f():
+        return sess.run((r1, r2))
+      def check_results(expected_normal, v1, v2):
+        self.assertAllEqual(expected_normal, v1)
+        self.assertAllEqual(shape, v2.shape)
+      check_results(expected_normal1, *f())
+      check_results(expected_normal2, *f())
+
+  @test_util.run_v2_only
   @test_util.also_run_as_tf_function
   def testEagerAndDefun(self):
     """A simple test to make sure the op works in eager and defunned mode."""
diff --git a/tensorflow/python/saved_model/load_test.py b/tensorflow/python/saved_model/load_test.py
index 0aaf712..f1c448c 100644
--- a/tensorflow/python/saved_model/load_test.py
+++ b/tensorflow/python/saved_model/load_test.py
@@ -28,10 +28,12 @@
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import test
+from tensorflow.python.feature_column import feature_column_v2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
+from tensorflow.python.keras.engine import sequential
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import lookup_ops
@@ -1110,6 +1112,21 @@
 
     self.assertEqual([2], root.f([2]).numpy())
 
+  def test_dense_features_layer(self, cycles):
+    columns = [feature_column_v2.numeric_column("x"),
+               feature_column_v2.numeric_column("y")]
+    layer = feature_column_v2.DenseFeatures(columns)
+    model = sequential.Sequential([layer])
+    model_input = {"x": constant_op.constant([[1.]]),
+                   "y": constant_op.constant([[2.]])}
+    self.assertAllClose([[1., 2.]], model.predict(model_input))
+    loaded = self.cycle(model, cycles)
+    output, = loaded._default_save_signature(model_input).values()
+    self.assertAllClose([[1., 2.]], output)
+    signature_output, = loaded.signatures["serving_default"](
+        **model_input).values()
+    self.assertAllClose([[1., 2.]], signature_output)
+
 
 class SingleCycleTests(test.TestCase, parameterized.TestCase):
 
diff --git a/tensorflow/python/tpu/BUILD b/tensorflow/python/tpu/BUILD
index 3ebd370..cdd151b 100644
--- a/tensorflow/python/tpu/BUILD
+++ b/tensorflow/python/tpu/BUILD
@@ -125,7 +125,6 @@
         "__init__.py",
         "bfloat16.py",
         "device_assignment.py",
-        "profile_logger.py",
         "session_support.py",
         "tensor_tracer.py",
         "topology.py",
diff --git a/tensorflow/python/tpu/profile_logger.py b/tensorflow/python/tpu/profile_logger.py
deleted file mode 100644
index ef648b5..0000000
--- a/tensorflow/python/tpu/profile_logger.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ========================================================================
-"""A logger for profiling events."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.core.framework.summary_pb2 import Summary
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.summary.writer import writer
-
-
-class ProfileLogger(object):
-  """For logging profiling events."""
-
-  def _set_summary_dir(self, model_dir):
-    """Sets the summary directory to be model_dir/profile_logger.summary."""
-    if model_dir is None:
-      self._summary_dir = None
-      self._summary_writer = None
-      logging.warning('profile_logger: model_dir is None.'
-                      'So nowhere to write summaries')
-      return
-    self._summary_dir = model_dir + '/profile_logger.summary'
-    try:
-      self._summary_writer = writer.FileWriter(self._summary_dir)
-      logging.info('profile_logger(): set the summary directory to %s',
-                   self._summary_dir)
-    except Exception:  # pylint: disable=broad-except
-      logging.warning('profile_logger(): failed to create %s',
-                      self._summary_dir)
-      self._summary_dir = None
-      self._summary_writer = None
-
-  def __init__(self, model_dir):
-    self._set_summary_dir(model_dir)
-
-  def log_event(self, event, phase):
-    """Logs the given event to the summary directory."""
-
-    event_name = event + '_' + phase
-    if self._summary_writer is None:
-      logging.warning('profile_logger: cannot log event "%s" '
-                      ' because of no summary directory', event_name)
-      return
-
-    # For now, we only need the event timestamp. No need to pass any value.
-    s = Summary(value=[Summary.Value(tag=event_name, simple_value=0.0)])
-    self._summary_writer.add_summary(s)
-    self._summary_writer.flush()
-    logging.info('profile_logger: log event "%s"', event_name)
-
diff --git a/tensorflow/python/tpu/tpu_estimator.py b/tensorflow/python/tpu/tpu_estimator.py
index 9980b85..fb32365 100644
--- a/tensorflow/python/tpu/tpu_estimator.py
+++ b/tensorflow/python/tpu/tpu_estimator.py
@@ -62,7 +62,6 @@
 from tensorflow.python.tpu import _tpu_estimator_embedding
 from tensorflow.python.tpu import error_handling
 from tensorflow.python.tpu import functional as tpu_functional
-from tensorflow.python.tpu import profile_logger
 from tensorflow.python.tpu import session_support
 from tensorflow.python.tpu import tensor_tracer
 from tensorflow.python.tpu import tpu
@@ -440,7 +439,6 @@
                enqueue_ops,
                dequeue_ops,
                tpu_compile_op,
-               prof_logger,
                run_infeed_loop_on_coordinator=True,
                rendezvous=None,
                master=None,
@@ -470,7 +468,6 @@
     # initialization.
     self._should_initialize_tpu = not ctx.model_parallelism_enabled
     self._tpu_compile_op = tpu_compile_op
-    self._profile_logger = prof_logger
 
   def begin(self):
     logging.info('TPU job name %s', self._master_job)
@@ -531,7 +528,6 @@
     if self._should_initialize_tpu:
       logging.info('Init TPU system')
       start = time.time()
-      self._profile_logger.log_event('init_system', 'begin')
       with ops.Graph().as_default():
         with tf_session.Session(
             self._master, config=self._session_config) as sess:
@@ -539,7 +535,6 @@
               tpu.initialize_system(
                   job=self._master_job,
                   embedding_config=self._embedding_layer_config))
-      self._profile_logger.log_event('init_system', 'end')
       logging.info('Initialized TPU in %d seconds', time.time() - start)
 
     session.run(self._init_ops,
@@ -589,15 +584,13 @@
 
 class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):
 
-  def __init__(self, ctx, enqueue_ops, dequeue_ops, tpu_compile_op, prof_logger,
-               rendezvous=None, master=None,
-               session_config=None):
+  def __init__(self, ctx, enqueue_ops, dequeue_ops, tpu_compile_op,
+               rendezvous=None, master=None, session_config=None):
     super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
         ctx,
         enqueue_ops,
         dequeue_ops,
         tpu_compile_op=tpu_compile_op,
-        prof_logger=prof_logger,
         run_infeed_loop_on_coordinator=False,
         rendezvous=rendezvous,
         master=master,
@@ -2382,7 +2375,6 @@
 
     self._is_input_fn_invoked = None
     self._rendezvous = {}
-    self._profile_logger = profile_logger.ProfileLogger(self.model_dir)
 
   def _add_meta_graph_for_mode(self,
                                builder,
@@ -2713,7 +2705,6 @@
     rendezvous = error_handling.ErrorRendezvous(num_sources=3)
     self._rendezvous[model_fn_lib.ModeKeys.TRAIN] = rendezvous
     try:
-      self._profile_logger.log_event('train', 'begin')
       return super(TPUEstimator, self).train(
           input_fn=input_fn,
           hooks=hooks,
@@ -2723,7 +2714,6 @@
     except Exception:  # pylint: disable=broad-except
       rendezvous.record_error('training_loop', sys.exc_info())
     finally:
-      self._profile_logger.log_event('train', 'end')
       rendezvous.record_done('training_loop')
       rendezvous.raise_errors()
 
@@ -2736,7 +2726,6 @@
     rendezvous = error_handling.ErrorRendezvous(num_sources=3)
     self._rendezvous[model_fn_lib.ModeKeys.EVAL] = rendezvous
     try:
-      self._profile_logger.log_event('eval', 'begin')
       return super(TPUEstimator, self).evaluate(
           input_fn,
           steps=steps,
@@ -2746,7 +2735,6 @@
     except Exception:  # pylint: disable=broad-except
       rendezvous.record_error('evaluation_loop', sys.exc_info())
     finally:
-      self._profile_logger.log_event('eval', 'end')
       rendezvous.record_done('evaluation_loop')
       rendezvous.raise_errors()
 
@@ -2759,7 +2747,6 @@
     rendezvous = error_handling.ErrorRendezvous(num_sources=3)
     self._rendezvous[model_fn_lib.ModeKeys.PREDICT] = rendezvous
     try:
-      self._profile_logger.log_event('predict', 'begin')
       for result in super(TPUEstimator, self).predict(
           input_fn=input_fn,
           predict_keys=predict_keys,
@@ -2770,7 +2757,6 @@
     except Exception:  # pylint: disable=broad-except
       rendezvous.record_error('prediction_loop', sys.exc_info())
     finally:
-      self._profile_logger.log_event('predict', 'end')
       rendezvous.record_done('prediction_loop')
       rendezvous.raise_errors()
 
@@ -2783,8 +2769,6 @@
     def _model_fn(features, labels, mode, config, params):
       """A Estimator `model_fn` for TPUEstimator."""
 
-      self._profile_logger.log_event('model_fn', 'begin')
-
       # `input_fn` is called in `train()`, `evaluate()`, and `predict()`,
       # but not in `export_savedmodel()`.
       if self._is_input_fn_invoked:
@@ -2824,7 +2808,6 @@
           if self._log_every_n_steps is not None:
             estimator_spec = estimator_spec._replace(
                 training_hooks=estimator_spec.training_hooks + (examples_hook,))
-          self._profile_logger.log_event('model_fn', 'end')
           return estimator_spec
 
         assert labels is None, '`labels` passed to `model_fn` must be `None`.'
@@ -2841,10 +2824,9 @@
           tpu_init_ops.append(dummy_table_variables_init)
 
         input_holders = _InputPipeline(input_fn, batch_axis, ctx)
-        self._profile_logger.log_event('setup_infeed', 'begin')
         enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
             input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
-        self._profile_logger.log_event('setup_infeed', 'end')
+
         graph = ops.get_default_graph()
         for enqueue_op in enqueue_ops:
           if isinstance(enqueue_op, list):
@@ -2907,7 +2889,6 @@
                   enqueue_ops,
                   host_ops,
                   tpu_compile_op=compile_op,
-                  prof_logger=self._profile_logger,
                   run_infeed_loop_on_coordinator=(
                       run_infeed_loop_on_coordinator),
                   rendezvous=self._rendezvous[mode],
@@ -2958,7 +2939,6 @@
           train_op = control_flow_ops.group(*update_ops)
           graph.add_to_collection(_TPU_TRAIN_OP, train_op)
 
-          self._profile_logger.log_event('model_fn', 'end')
           return model_fn_lib.EstimatorSpec(
               mode,
               loss=loss,
@@ -3030,7 +3010,6 @@
                   enqueue_ops,
                   eval_update_ops + host_ops,
                   tpu_compile_op=compile_op,
-                  prof_logger=self._profile_logger,
                   run_infeed_loop_on_coordinator=(
                       run_infeed_loop_on_coordinator),
                   rendezvous=self._rendezvous[mode],
@@ -3042,7 +3021,6 @@
           if eval_hooks:
             hooks.extend(eval_hooks)
 
-          self._profile_logger.log_event('model_fn', 'end')
           return model_fn_lib.EstimatorSpec(
               mode,
               loss=mean_loss,
@@ -3111,7 +3089,6 @@
             TPUInfeedOutfeedSessionHookForPrediction(
                 ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode],
                 tpu_compile_op=compile_op,
-                prof_logger=self._profile_logger,
                 master=self._config.master,
                 session_config=self._session_config),
         ] + input_hooks
@@ -3119,7 +3096,6 @@
         if prediction_hooks:
           hooks.extend(prediction_hooks)
 
-        self._profile_logger.log_event('model_fn', 'end')
         return model_fn_lib.EstimatorSpec(
             mode,
             prediction_hooks=hooks,
diff --git a/tensorflow/python/util/tf_export.py b/tensorflow/python/util/tf_export.py
index f9ab5dd..04c96d0 100644
--- a/tensorflow/python/util/tf_export.py
+++ b/tensorflow/python/util/tf_export.py
@@ -382,17 +382,17 @@
 
 def kwarg_only(f):
   """A wrapper that throws away all non-kwarg arguments."""
+  f_argspec = tf_inspect.getargspec(f)
 
   def wrapper(*args, **kwargs):
     if args:
       raise TypeError(
-          '{} only takes keyword args. The following args were provided: {}. '
+          '{f} only takes keyword args (possible keys: {kwargs}). '
           'Please pass these args as kwargs instead.'
-          .format(f.__name__, args))
+          .format(f=f.__name__, kwargs=f_argspec.args))
     return f(**kwargs)
 
-  return tf_decorator.make_decorator(
-      f, wrapper, decorator_argspec=tf_inspect.getargspec(f))
+  return tf_decorator.make_decorator(f, wrapper, decorator_argspec=f_argspec)
 
 
 tf_export = functools.partial(api_export, api_name=TENSORFLOW_API_NAME)
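The kwarg_only rewrite computes the argspec once at decoration time and names the accepted keywords in the error message. A standalone sketch of the same pattern in plain Python (the decorator and example function below are illustrative, not the TF helpers):

import functools
import inspect


def kwarg_only(f):
    """Reject positional arguments and list the accepted keyword names."""
    arg_names = list(inspect.signature(f).parameters)  # computed once, at decoration time

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        if args:
            raise TypeError(
                "{name} only takes keyword args (possible keys: {keys}). "
                "Please pass these args as kwargs instead.".format(
                    name=f.__name__, keys=arg_names))
        return f(**kwargs)

    return wrapper


@kwarg_only
def scale(x, factor=2):
    return x * factor


print(scale(x=3))  # 6; scale(3) would raise TypeError listing ['x', 'factor']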
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index c2a4d21..bbc1fba 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -10,6 +10,7 @@
     "tf_additional_xla_deps_py",
     "tf_cuda_tests_tags",
     "tf_exec_compatible_with",
+    "tf_gpu_tests_tags",
     "tf_sycl_tests_tags",
 )
 load(
@@ -908,7 +909,7 @@
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_cuda_cc_test(
+def tf_gpu_cc_test(
         name,
         srcs = [],
         deps = [],
@@ -950,18 +951,29 @@
             "//conditions:default": 0,
         }),
         suffix = "_gpu",
-        tags = tags + tf_cuda_tests_tags(),
-        deps = deps + if_cuda([
+        tags = tags + tf_gpu_tests_tags(),
+        deps = deps + if_cuda_is_configured([
+            clean_dep("//tensorflow/core:gpu_runtime"),
+        ]) + if_rocm_is_configured([
             clean_dep("//tensorflow/core:gpu_runtime"),
         ]),
     )
 
 register_extension_info(
+    extension_name = "tf_gpu_cc_test",
+    label_regex_for_dep = "{extension_name}",
+)
+
+# Terminology change: keep the tf_cuda_* definition for compatibility
+def tf_cuda_cc_test(*args, **kwargs):
+    tf_gpu_cc_test(*args, **kwargs)
+
+register_extension_info(
     extension_name = "tf_cuda_cc_test",
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_cuda_only_cc_test(
+def tf_gpu_only_cc_test(
         name,
         srcs = [],
         deps = [],
@@ -972,7 +984,7 @@
         args = [],
         kernels = [],
         linkopts = []):
-    tags = tags + tf_cuda_tests_tags()
+    tags = tags + tf_gpu_tests_tags()
     native.cc_test(
         name = "%s%s" % (name, "_gpu"),
         srcs = srcs + tf_binary_additional_srcs(),
@@ -1000,6 +1012,15 @@
     )
 
 register_extension_info(
+    extension_name = "tf_gpu_only_cc_test",
+    label_regex_for_dep = "{extension_name}_gpu",
+)
+
+# Terminology change: keep the tf_cuda_* definition for compatibility
+def tf_cuda_only_cc_test(*args, **kwargs):
+    tf_gpu_only_cc_test(*args, **kwargs)
+
+register_extension_info(
     extension_name = "tf_cuda_only_cc_test",
     label_regex_for_dep = "{extension_name}_gpu",
 )
@@ -1080,7 +1101,7 @@
         args = None):
     tf_cc_tests(srcs, deps, linkstatic, size = size, args = args, kernels = kernels, tags = tags)
 
-def tf_cuda_cc_tests(
+def tf_gpu_cc_tests(
         srcs,
         deps,
         name = "",
@@ -1091,7 +1112,7 @@
         kernels = [],
         linkopts = []):
     for src in srcs:
-        tf_cuda_cc_test(
+        tf_gpu_cc_test(
             name = src_to_test_name(src),
             size = size,
             srcs = [src],
@@ -1103,6 +1124,10 @@
             deps = deps,
         )
 
+# Terminology change: keep the tf_cuda_* definition for compatibility
+def tf_cuda_cc_tests(*args, **kwargs):
+    tf_gpu_cc_tests(*args, **kwargs)
+
 def tf_java_test(
         name,
         srcs = [],
@@ -1179,7 +1204,7 @@
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_cuda_library(deps = None, cuda_deps = None, copts = tf_copts(), **kwargs):
+def tf_gpu_library(deps = None, cuda_deps = None, copts = tf_copts(), **kwargs):
     """Generate a cc_library with a conditional set of CUDA dependencies.
 
     When the library is built with --config=cuda:
@@ -1215,6 +1240,15 @@
     )
 
 register_extension_info(
+    extension_name = "tf_gpu_library",
+    label_regex_for_dep = "{extension_name}",
+)
+
+# Terminology change: keep the tf_cuda_* definition for compatibility
+def tf_cuda_library(*args, **kwargs):
+    tf_gpu_library(*args, **kwargs)
+
+register_extension_info(
     extension_name = "tf_cuda_library",
     label_regex_for_dep = "{extension_name}",
 )
@@ -1232,7 +1266,7 @@
         **kwargs):
     """A rule to build a TensorFlow OpKernel.
 
-    May either specify srcs/hdrs or prefix.  Similar to tf_cuda_library,
+    May either specify srcs/hdrs or prefix.  Similar to tf_gpu_library,
     but with alwayslink=1 by default.  If prefix is specified:
       * prefix*.cc (except *.cu.cc) is added to srcs
       * prefix*.h (except *.cu.h) is added to hdrs
@@ -1303,7 +1337,7 @@
         "req_dep=%s" % clean_dep("//tensorflow/core:gpu_lib"),
         "req_dep=@local_config_cuda//cuda:cuda_headers",
     ]
-    tf_cuda_library(
+    tf_gpu_library(
         name = name,
         srcs = srcs,
         hdrs = hdrs,
@@ -1389,6 +1423,7 @@
     inputs += ctx.files._swiglib
     inputs += ctx.files.toolchain_deps
     swig_include_dirs = depset(_get_repository_roots(ctx, inputs))
+    swig_include_dirs += sorted([f.dirname for f in ctx.files._swiglib])
     args = [
         "-c++",
         "-python",
@@ -1916,7 +1951,7 @@
     label_regex_map = {"additional_deps": "deps:{extension_name}"},
 )
 
-def cuda_py_test(
+def gpu_py_test(
         name,
         srcs,
         size = "medium",
@@ -1941,7 +1976,7 @@
         test_tags = tags
         if config == "gpu":
             test_name += "_gpu"
-            test_tags = test_tags + tf_cuda_tests_tags()
+            test_tags = test_tags + tf_gpu_tests_tags()
         tf_py_test(
             name = test_name,
             size = size,
@@ -1960,6 +1995,15 @@
         )
 
 register_extension_info(
+    extension_name = "gpu_py_test",
+    label_regex_map = {"additional_deps": "additional_deps:{extension_name}"},
+)
+
+# Terminology change: keep the cuda_* definition for compatibility
+def cuda_py_test(*args, **kwargs):
+    gpu_py_test(*args, **kwargs)
+
+register_extension_info(
     extension_name = "cuda_py_test",
     label_regex_map = {"additional_deps": "additional_deps:{extension_name}"},
 )
@@ -2032,7 +2076,7 @@
             xla_enable_strict_auto_jit = xla_enable_strict_auto_jit,
         )
 
-def cuda_py_tests(
+def gpu_py_tests(
         name,
         srcs,
         size = "medium",
@@ -2048,7 +2092,7 @@
     # TODO(b/122522101): Don't ignore xla_enable_strict_auto_jit and enable additional
     # XLA tests once enough compute resources are available.
     _ignored = [xla_enable_strict_auto_jit]
-    test_tags = tags + tf_cuda_tests_tags()
+    test_tags = tags + tf_gpu_tests_tags()
     py_tests(
         name = name,
         size = size,
@@ -2064,6 +2108,10 @@
         xla_enable_strict_auto_jit = False,
     )
 
+# Terminology change: keep the cuda_* definition for compatibility
+def cuda_py_tests(*args, **kwargs):
+    gpu_py_tests(*args, **kwargs)
+
 # Creates a genrule named <name> for running tools/proto_text's generator to
 # make the proto_text functions, for the protos passed in <srcs>.
 #
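The tf_cuda_* to tf_gpu_* renames above all follow the same shape: the implementation moves to the new name and the old macro stays as a thin forwarder so existing BUILD files keep working. A plain-Python sketch of that forwarding-alias pattern (the names below are placeholders, not the Bazel macros):

def gpu_cc_test(name, srcs=(), tags=(), **kwargs):
    """New name; carries the real implementation."""
    print("registering GPU test", name, list(srcs), list(tags), kwargs)


def cuda_cc_test(*args, **kwargs):
    """Old name, kept as a compatibility alias after the terminology change."""
    gpu_cc_test(*args, **kwargs)


cuda_cc_test(name="example_test", srcs=["example_test.cc"], tags=["gpu"])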
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
index 853054a..9aa5955 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
@@ -113,10 +113,6 @@
     argspec: "args=[\'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
   member_method {
-    name: "filter_for_shard"
-    argspec: "args=[\'num_shards\', \'shard_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
     name: "get_next_as_optional"
     argspec: "args=[\'iterator\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-estimator.pbtxt
new file mode 100644
index 0000000..0fdbecb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-estimator.pbtxt
@@ -0,0 +1,76 @@
+path: "tensorflow.estimator.BoostedTreesEstimator"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesEstimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'head\', \'model_dir\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_feature_importances"
+    argspec: "args=[\'self\', \'normalize\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "experimental_predict_with_explanations"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
index b247485..9eb3ccf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
@@ -21,10 +21,6 @@
     argspec: "args=[\'logit_fn\', \'features\', \'mode\', \'params\', \'config\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "dnn_logit_fn_builder"
-    argspec: "args=[\'units\', \'hidden_units\', \'feature_columns\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'batch_norm\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
     name: "linear_logit_fn_builder"
     argspec: "args=[\'units\', \'feature_columns\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'sum\'], "
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
index 6f57505..43a5e97 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
@@ -21,6 +21,10 @@
     mtype: "<type \'type\'>"
   }
   member {
+    name: "BoostedTreesEstimator"
+    mtype: "<type \'type\'>"
+  }
+  member {
     name: "BoostedTreesRegressor"
     mtype: "<type \'type\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
index 08b7657..6957568 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
@@ -113,10 +113,6 @@
     argspec: "args=[\'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
   member_method {
-    name: "filter_for_shard"
-    argspec: "args=[\'num_shards\', \'shard_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
     name: "get_next_as_optional"
     argspec: "args=[\'iterator\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-estimator.pbtxt
new file mode 100644
index 0000000..0fdbecb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-estimator.pbtxt
@@ -0,0 +1,76 @@
+path: "tensorflow.estimator.BoostedTreesEstimator"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesEstimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'head\', \'model_dir\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_feature_importances"
+    argspec: "args=[\'self\', \'normalize\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "experimental_predict_with_explanations"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
index 2ec46d5..a4e9b56 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
@@ -21,7 +21,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'sum_over_batch_size\', \'False\'], "
+    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'activation_fn\', \'dropout\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'sum_over_batch_size\', \'False\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt
index d1b29d6..cfb4925 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt
@@ -21,7 +21,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'head\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'False\'], "
+    argspec: "args=[\'self\', \'head\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'optimizer\', \'activation_fn\', \'dropout\', \'config\', \'warm_start_from\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'False\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
index d57df9e..89029a2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
@@ -21,7 +21,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'2\', \'None\', \'None\', \'None\', \'None\', \'None\', \'sum_over_batch_size\', \'False\', \'sum\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'2\', \'None\', \'None\', \'None\', \'None\', \'sum_over_batch_size\', \'False\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
index b785272..d810542 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
@@ -21,7 +21,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'head\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'input_layer_partitioner\', \'config\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'sum\'], "
+    argspec: "args=[\'self\', \'head\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'config\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'None\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
index 4c5abd9..a158cd0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
@@ -21,7 +21,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'label_dimension\', \'weight_column\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'1\', \'None\', \'None\', \'None\', \'None\', \'sum_over_batch_size\', \'False\', \'sum\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'label_dimension\', \'weight_column\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'1\', \'None\', \'None\', \'None\', \'sum_over_batch_size\', \'False\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
index 0bd0382..cb6c882 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
@@ -21,7 +21,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'sum_over_batch_size\', \'False\'], "
+    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'activation_fn\', \'dropout\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'sum_over_batch_size\', \'False\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
index 5434d58..5c0e8f5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
@@ -21,7 +21,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'sum_over_batch_size\', \'sum\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'None\', \'sum_over_batch_size\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt
index 66a1276..e562bf1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt
@@ -21,7 +21,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'head\', \'feature_columns\', \'model_dir\', \'optimizer\', \'config\', \'partitioner\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'Ftrl\', \'None\', \'None\', \'sum\'], "
+    argspec: "args=[\'self\', \'head\', \'feature_columns\', \'model_dir\', \'optimizer\', \'config\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'Ftrl\', \'None\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
index 42fed17..5df9361 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
@@ -21,7 +21,7 @@
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'sum_over_batch_size\', \'sum\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'None\', \'sum_over_batch_size\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
index b1bd5a2..de0470b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
@@ -18,7 +18,7 @@
   }
   member_method {
     name: "dnn_logit_fn_builder"
-    argspec: "args=[\'units\', \'hidden_units\', \'feature_columns\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'batch_norm\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'units\', \'hidden_units\', \'feature_columns\', \'activation_fn\', \'dropout\', \'batch_norm\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "linear_logit_fn_builder"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
index add8ef5..a7b72d6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
@@ -21,6 +21,10 @@
     mtype: "<type \'type\'>"
   }
   member {
+    name: "BoostedTreesEstimator"
+    mtype: "<type \'type\'>"
+  }
+  member {
     name: "BoostedTreesRegressor"
     mtype: "<type \'type\'>"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
index fb89501..c9b250b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.layers.GRU"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.UnifiedGRU\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent_v2.GRU\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.DropoutRNNCellMixin\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRU\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
index aee27ad..4f07fad 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.layers.LSTM"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.UnifiedLSTM\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent_v2.LSTM\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.DropoutRNNCellMixin\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTM\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py
index 5a3326f..6cdb405 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py
@@ -1472,6 +1472,78 @@
         "tf.nn.fractional_avg_pool": _pool_seed_transformer,
         "tf.nn.fractional_max_pool": _pool_seed_transformer,
         "tf.name_scope": _name_scope_transformer,
+        "tf.estimator.DNNEstimator":
+            functools.partial(
+                _rename_if_arg_found_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.DNNEstimator no longer takes "
+                "input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.estimator.DNNClassifier":
+            functools.partial(
+                _rename_if_arg_found_and_add_loss_reduction_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.DNNClassifier no longer takes "
+                "input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.estimator.DNNRegressor":
+            functools.partial(
+                _rename_if_arg_found_and_add_loss_reduction_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.DNNRegressor no longer takes "
+                "input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.estimator.LinearEstimator":
+            functools.partial(
+                _rename_if_arg_found_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.LinearEstimator no longer takes "
+                "input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.estimator.LinearClassifier":
+            functools.partial(
+                _rename_if_arg_found_and_add_loss_reduction_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.LinearClassifier no longer takes "
+                "input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.estimator.LinearRegressor":
+            functools.partial(
+                _rename_if_arg_found_and_add_loss_reduction_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.LinearRegressor no longer takes "
+                "input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.estimator.DNNLinearCombinedEstimator":
+            functools.partial(
+                _rename_if_arg_found_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.DNNLinearCombinedEstimator no longer "
+                "takes input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.estimator.DNNLinearCombinedClassifier":
+            functools.partial(
+                _rename_if_arg_found_and_add_loss_reduction_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.DNNLinearCombinedClassifier no longer "
+                "takes input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
+        "tf.estimator.DNNLinearCombinedRegressor":
+            functools.partial(
+                _rename_if_arg_found_and_add_loss_reduction_transformer,
+                arg_name="input_layer_partitioner",
+                message="tf.estimator.DNNLinearCombinedRegressor no longer "
+                "takes input_layer_partitioner, so the call was converted to "
+                "compat.v1."
+            ),
         "tf.device": functools.partial(
             _rename_if_arg_found_transformer, arg_name="device_name",
             arg_ok_predicate=_is_ast_str, remove_if_ok=False,
@@ -1514,14 +1586,6 @@
         "tf.contrib.summary.histogram": _add_summary_step_transformer,
         "tf.contrib.summary.image": _add_summary_step_transformer,
         "tf.contrib.summary.scalar": _add_summary_step_transformer,
-        "tf.estimator.LinearClassifier": _add_loss_reduction_transformer,
-        "tf.estimator.LinearRegressor": _add_loss_reduction_transformer,
-        "tf.estimator.DNNLinearCombinedClassifier":
-            _add_loss_reduction_transformer,
-        "tf.estimator.DNNLinearCombinedRegressor":
-            _add_loss_reduction_transformer,
-        "tf.estimator.DNNRegressor": _add_loss_reduction_transformer,
-        "tf.estimator.DNNClassifier": _add_loss_reduction_transformer,
         "tf.estimator.BaselineClassifier": _add_loss_reduction_transformer,
         "tf.estimator.BaselineRegressor": _add_loss_reduction_transformer,
         "tf.initializers.uniform_unit_scaling":
@@ -1974,6 +2038,45 @@
   return node
 
 
+def _rename_if_arg_found_and_add_loss_reduction_transformer(
+    parent,
+    node,
+    full_name,
+    name,
+    logs,
+    arg_name=None,
+    arg_ok_predicate=None,
+    remove_if_ok=False,
+    message=None):
+  """Combination of _rename_if_arg_found and _add_loss_reduction transformers.
+
+  Args:
+    parent: Parent of node.
+    node: ast.Call node to maybe modify.
+    full_name: full name of function to modify
+    name: name of function to modify
+    logs: list of logs to append to
+    arg_name: name of the argument to look for
+    arg_ok_predicate: predicate callable with the ast of the argument value,
+      returns whether the argument value is allowed.
+    remove_if_ok: remove the argument if present and ok as determined by
+      arg_ok_predicate.
+    message: message to print if a non-ok arg is found (and hence, the function
+      is renamed to its compat.v1 version).
+
+  Returns:
+    node, if it was modified, else None.
+  """
+
+  add_loss_node = _add_loss_reduction_transformer(parent, node, full_name, name,
+                                                  logs)
+  rename_node = _rename_if_arg_found_transformer(
+      parent, add_loss_node, full_name, name, logs, arg_name, arg_ok_predicate,
+      remove_if_ok, message)
+
+  return rename_node
+
+
 def _add_uniform_scaling_initializer_transformer(
     parent, node, full_name, name, logs):
   """Updates references to uniform_unit_scaling_initializer.
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
index 0c46580..c68bc5f 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
@@ -518,6 +518,32 @@
     _, report, errors, new_text = self._upgrade(text)
     self.assertEqual(expected_text, new_text)
 
+  def testBaseEstimatorPartitioner(self):
+    classes = ["LinearEstimator", "DNNLinearCombinedEstimator", "DNNEstimator"]
+    for c in classes:
+      ns = "tf.estimator." + c
+      suffix = "(input_layer_partitioner=TEST)"
+      text = ns + suffix
+      expected_text = "tf.compat.v1.estimator." + c + suffix
+      _, unused_report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(new_text, expected_text)
+
+  def testCannedEstimatorPartitioner(self):
+    classes = [
+        "LinearClassifier", "LinearRegressor", "DNNLinearCombinedClassifier",
+        "DNNLinearCombinedRegressor", "DNNRegressor", "DNNClassifier"
+    ]
+
+    for c in classes:
+      ns = "tf.estimator." + c
+      suffix = "(input_layer_partitioner=TEST)"
+      text = ns + suffix
+      suffix = ("(input_layer_partitioner=TEST, "
+                "loss_reduction=tf.compat.v1.losses.Reduction.SUM)")
+      expected_text = "tf.compat.v1.estimator." + c + suffix
+      _, unused_report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(new_text, expected_text)
+
   def testExtractGlimpse(self):
     text = ("tf.image.extract_glimpse(x, size, off, False, "
             "False, False, name=\"foo\")\n")
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 96cfda7..8700f9c 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -133,11 +133,11 @@
     tf_http_archive(
         name = "com_google_absl",
         build_file = clean_dep("//third_party:com_google_absl.BUILD"),
-        sha256 = "c74a9a596999c5ed6a3c816a99ad09d3004109e000e6f86d6fdc291c4fdc8120",
-        strip_prefix = "abseil-cpp-88a152ae747c3c42dc9167d46c590929b048d436",
+        sha256 = "bc275e1e4642b7e6d4e867b999886971bf0e75b57cb7ad723ffeeb3d595924d7",
+        strip_prefix = "abseil-cpp-256be563447a315f2a7993ec669460ba475fa86a",
         urls = [
-            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/88a152ae747c3c42dc9167d46c590929b048d436.tar.gz",
-            "https://github.com/abseil/abseil-cpp/archive/88a152ae747c3c42dc9167d46c590929b048d436.tar.gz",
+            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/256be563447a315f2a7993ec669460ba475fa86a.tar.gz",
+            "https://github.com/abseil/abseil-cpp/archive/256be563447a315f2a7993ec669460ba475fa86a.tar.gz",
         ],
     )
 
@@ -518,11 +518,11 @@
     tf_http_archive(
         name = "llvm",
         build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"),
-        sha256 = "9ce81d8dff409a34172fee0e834410a463cb8381c9d6ef3f02e9f0521d7cac9e",
-        strip_prefix = "llvm-f3ff35c66b59f9ba0fe7d59874e7dac6a828c0a4",
+        sha256 = "015878aac86e9afb2db9b8877b814fbcb146a7ee2b79bfce9fd99b6c3d076d7b",
+        strip_prefix = "llvm-d13e5110081a68cacd856d3a3c7860996d67679d",
         urls = [
-            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/f3ff35c66b59f9ba0fe7d59874e7dac6a828c0a4.tar.gz",
-            "https://github.com/llvm-mirror/llvm/archive/f3ff35c66b59f9ba0fe7d59874e7dac6a828c0a4.tar.gz",
+            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/d13e5110081a68cacd856d3a3c7860996d67679d.tar.gz",
+            "https://github.com/llvm-mirror/llvm/archive/d13e5110081a68cacd856d3a3c7860996d67679d.tar.gz",
         ],
     )
 
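When a tf_http_archive is bumped to a new commit, as in the abseil-cpp and llvm updates above, the sha256 field must be recomputed for the new tarball. A small helper sketch; the URL is the new abseil archive from the hunk above, and any of the listed mirrors works the same way:

import hashlib
import urllib.request


def archive_sha256(url, chunk_size=1 << 20):
    """Stream an archive and return its hex-encoded SHA-256 digest."""
    digest = hashlib.sha256()
    with urllib.request.urlopen(url) as response:
        for chunk in iter(lambda: response.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


print(archive_sha256(
    "https://github.com/abseil/abseil-cpp/archive/"
    "256be563447a315f2a7993ec669460ba475fa86a.tar.gz"))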
diff --git a/third_party/icu/udata.patch b/third_party/icu/udata.patch
index d6d5910..2af6718 100644
--- a/third_party/icu/udata.patch
+++ b/third_party/icu/udata.patch
@@ -1,3 +1,18 @@
+--- /icu4c/source/common/unicode/uconfig.h	2018-06-19 22:34:56.000000000 -0700
++++ /icu4c/source/common/unicode/uconfig.h.new	2019-03-12 10:12:35.896095657 -0700
+@@ -55,6 +55,11 @@
+ #include "uconfig_local.h"
+ #endif
+ 
++// Tensorflow is statically linked on all platforms.
++#ifndef U_STATIC_IMPLEMENTATION
++#define U_STATIC_IMPLEMENTATION
++#endif
++
+ /**
+  * \def U_DEBUG
+  * Determines whether to include debugging code.
+
 --- /icu4c/source/common/udata.cpp.old	2018-06-19 22:34:56.000000000 -0700
 +++ /icu4c/source/common/udata.cpp	2018-10-19 14:26:09.778950855 -0700
 @@ -18,15 +18,15 @@