Automated g4 rollback of changelist 179260538
PiperOrigin-RevId: 179263865
diff --git a/.gitignore b/.gitignore
index d11a504..be75938 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,7 @@
/bazel-*
/bazel_pip
/tools/python_bin_path.sh
-/tools/git/gen
+/tensorflow/tools/git/gen
/pip_test
/_python_build
*.pyc
@@ -26,4 +26,11 @@
/tensorflow/contrib/lite/gen/**
/tensorflow/contrib/lite/examples/ios/simple/data/*.txt
/tensorflow/contrib/lite/examples/ios/simple/data/*.tflite
-xcuserdata/**
\ No newline at end of file
+xcuserdata/**
+
+# Android
+.gradle
+.idea
+*.iml
+local.properties
+gradleBuild
diff --git a/configure.py b/configure.py
index 3d553e1..3365595 100644
--- a/configure.py
+++ b/configure.py
@@ -36,8 +36,8 @@
'.tf_configure.bazelrc')
_TF_WORKSPACE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'WORKSPACE')
-_DEFAULT_CUDA_VERSION = '8.0'
-_DEFAULT_CUDNN_VERSION = '6'
+_DEFAULT_CUDA_VERSION = '9.0'
+_DEFAULT_CUDNN_VERSION = '7'
_DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,5.2'
_DEFAULT_CUDA_PATH = '/usr/local/cuda'
_DEFAULT_CUDA_PATH_LINUX = '/opt/cuda'
@@ -1096,6 +1096,27 @@
write_action_env_to_bazelrc('COMPUTECPP_TOOLKIT_PATH',
computecpp_toolkit_path)
+def set_trisycl_include_dir(environ_cp):
+ """Set TRISYCL_INCLUDE_DIR"""
+ ask_trisycl_include_dir = ('Please specify the location of the triSYCL '
+ 'include directory. (Use --config=sycl_trisycl '
+ 'when building with Bazel) '
+ '[Default is %s]: '
+ ) % (_DEFAULT_TRISYCL_INCLUDE_DIR)
+ while True:
+ trisycl_include_dir = get_from_env_or_user_or_default(
+ environ_cp, 'TRISYCL_INCLUDE_DIR', ask_trisycl_include_dir,
+ _DEFAULT_TRISYCL_INCLUDE_DIR)
+ if os.path.exists(trisycl_include_dir):
+ break
+
+ print('Invalid triSYCL include directory, %s cannot be found'
+ % (trisycl_include_dir))
+
+ # Set TRISYCL_INCLUDE_DIR
+ environ_cp['TRISYCL_INCLUDE_DIR'] = trisycl_include_dir
+ write_action_env_to_bazelrc('TRISYCL_INCLUDE_DIR',
+ trisycl_include_dir)
def set_trisycl_include_dir(environ_cp):
"""Set TRISYCL_INCLUDE_DIR."""
@@ -1211,6 +1232,15 @@
def set_grpc_build_flags():
write_to_bazelrc('build --define grpc_no_ares=true')
+def set_windows_build_flags():
+ if is_windows():
+ # The non-monolithic build is not supported yet
+ write_to_bazelrc('build --config monolithic')
+ # Suppress warning messages
+ write_to_bazelrc('build --copt=-w --host_copt=-w')
+ # Output more verbose information when something goes wrong
+ write_to_bazelrc('build --verbose_failures')
+
def main():
# Make a copy of os.environ to be clear when functions and getting and setting
@@ -1289,6 +1319,7 @@
set_cc_opt_flags(environ_cp)
set_mkl()
set_monolithic()
+ set_windows_build_flags()
create_android_bazelrc_configs()
if workspace_has_any_android_rule():
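
For readers following the configure.py changes above: both `set_trisycl_include_dir` and `set_windows_build_flags` follow the script's usual pattern of validating a value and then persisting it to `.tf_configure.bazelrc`. A minimal standalone sketch of that pattern, with illustrative helper names that are not the actual configure.py functions:

```python
import os

def ask_for_existing_dir(prompt, default):
  """Ask the user for a directory until an existing path is given."""
  while True:
    path = input(prompt).strip() or default
    if os.path.isdir(path):
      return path
    print('Invalid directory, %s cannot be found' % path)

def write_action_env(bazelrc_path, var_name, var_value):
  """Append an --action_env line in the style configure.py writes."""
  with open(bazelrc_path, 'a') as f:
    f.write('build --action_env %s="%s"\n' % (var_name, var_value))
```
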
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 6f5abd0..9b57047 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -579,6 +579,7 @@
status->status = InvalidArgument(
"invalid string tensor encoding (string #", i, " of ",
srcarray.size(), "): ", status->status.error_message());
+ delete[] base;
return nullptr;
}
dst += consumed;
@@ -588,6 +589,7 @@
status->status = InvalidArgument(
"invalid string tensor encoding (decoded ", (dst - base),
" bytes, but the tensor is encoded in ", size, " bytes");
+ delete[] base;
return nullptr;
}
diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc
index b9312c2..d60d1de 100644
--- a/tensorflow/c/c_api_function.cc
+++ b/tensorflow/c/c_api_function.cc
@@ -68,7 +68,7 @@
// This is a superset of values in name_mapping_.
std::unordered_set<string> used_names_;
// Mapping from original node name from the graph to the normalized
- // and uniqified version of it.
+ // and uniquified version of it.
std::unordered_map<string, string> name_mapping_;
};
@@ -236,7 +236,7 @@
}
// Graph to FunctionDef conversion. This code is closely modeled on the Python
-// code in third_party/tensorflow/python/framework/function.py.
+// code in tensorflow/python/framework/function.py.
Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
bool append_hash_to_fn_name,
const std::vector<const Node*>& body_nodes,
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index 351fda2..03c2235 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -311,6 +311,32 @@
xla::ComputationDataHandle ta = resource->value;
+ // Look for the case where the gather takes a simple slice from the
+ // tensor array (0, 1, 2, 3, 4, ..., N)
+ std::vector<int64> const_indices;
+ Status status = ctx->ConstantInputAsIntVector(1, &const_indices);
+ if (status.ok()) {
+ bool gather_is_dense_slice = true;
+ for (auto i = 0; i < const_indices.size(); i++) {
+ if (const_indices[i] != i) {
+ gather_is_dense_slice = false;
+ break;
+ }
+ }
+
+ if (gather_is_dense_slice) {
+ std::vector<int64> begin(ta_shape.dims(), 0);
+ std::vector<int64> strides(ta_shape.dims(), 1);
+ std::vector<int64> end(ta_shape.dims(), 1);
+ end[0] = const_indices.size();
+ for (auto i = 1; i < ta_shape.dims(); i++) {
+ end[i] = ta_shape.dim_size(i);
+ }
+ ctx->SetOutput(0, b->Slice(ta, begin, end, strides));
+ return;
+ }
+ }
+
xla::ComputationDataHandle gather = XlaComputeGatherDynamicSlice(
ctx, ta, ta_shape, indices, indices_shape, 0, dtype_, index_type, b);
ctx->SetOutput(0, gather);
@@ -352,28 +378,47 @@
const xla::ComputationDataHandle value = ctx->Input(2);
const xla::ComputationDataHandle flow = ctx->Input(3);
- auto slice_dims = value_shape.dim_sizes();
- slice_dims[0] = 1LL;
+ // Look for the case where the scatter is for each sub-tensor in order. The
+ // tensor array implementation allows for this to be a straight addition.
+ bool scatter_all_elements_in_order = false;
+ std::vector<int64> const_indices;
+ Status status = ctx->ConstantInputAsIntVector(1, &const_indices);
+ if (status.ok() && num_indices == value_shape.dim_size(0)) {
+ scatter_all_elements_in_order = true;
+ for (auto i = 0; i < num_indices; i++) {
+ if (const_indices[i] != i) {
+ scatter_all_elements_in_order = false;
+ break;
+ }
+ }
+ }
- std::vector<int64> value_starts(value_shape.dims(), 0);
- auto value_ends = value_shape.dim_sizes();
+ if (scatter_all_elements_in_order) {
+ ta = b->Add(ta, value);
+ } else {
+ auto slice_dims = value_shape.dim_sizes();
+ slice_dims[0] = 1LL;
- std::vector<int64> value_strides(value_shape.dims(), 1);
+ std::vector<int64> value_starts(value_shape.dims(), 0);
+ auto value_ends = value_shape.dim_sizes();
- // For every (index, value) pair, update the corresponding TensorArray
- // storage.
- for (int i = 0; i < num_indices; ++i) {
- // Slice out part of the value.
- value_starts[0] = i;
- value_ends[0] = i + 1;
- auto slice = b->Slice(value, value_starts, value_ends, value_strides);
+ std::vector<int64> value_strides(value_shape.dims(), 1);
- // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
- auto index = b->Slice(indices, {i}, {i + 1}, {1});
- auto start_indices =
- b->Pad(b->Reshape(index, {1}), b->ConstantR0<int32>(0),
- xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
- ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices);
+ // For every (index, value) pair, update the corresponding TensorArray
+ // storage.
+ for (int i = 0; i < num_indices; ++i) {
+ // Slice out part of the value.
+ value_starts[0] = i;
+ value_ends[0] = i + 1;
+ auto slice = b->Slice(value, value_starts, value_ends, value_strides);
+
+ // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
+ auto index = b->Slice(indices, {i}, {i + 1}, {1});
+ auto start_indices =
+ b->Pad(b->Reshape(index, {1}), b->ConstantR0<int32>(0),
+ xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
+ ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices);
+ }
}
resource->value = ta;
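
The two hunks above add a fast path for XLA TensorArray gather/scatter: when the indices are statically known to be exactly 0, 1, ..., N-1, a gather reduces to a contiguous slice and a scatter reduces to a plain addition. A NumPy sketch of the equivalence being exploited (illustrative only, not the XLA code):

```python
import numpy as np

ta = np.zeros((4, 3))                  # tensor array: 4 elements of shape [3]
value = np.arange(12.0).reshape(4, 3)  # one row of values per index
indices = np.array([0, 1, 2, 3])

if np.array_equal(indices, np.arange(len(indices))):
  gathered = ta[:len(indices)]         # gather == contiguous slice
  scattered = ta + value               # scatter == straight addition
else:
  gathered = ta[indices]               # general gather
  scattered = ta.copy()
  for i, idx in enumerate(indices):    # general per-index update
    scattered[idx] += value[i]
```
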
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 61f7821..604c41b 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -9,7 +9,12 @@
py_library(
name = "contrib_py",
- srcs = glob(["**/*.py"]),
+ srcs = glob(
+ ["**/*.py"],
+ exclude = [
+ "**/*_test.py",
+ ],
+ ),
srcs_version = "PY2AND3",
visibility = ["//visibility:public"],
deps = [
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 77a3fc0..481caf6 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -18,7 +18,6 @@
# Options
option(tensorflow_VERBOSE "Enable for verbose output" OFF)
-option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
option(tensorflow_ENABLE_SSL_SUPPORT "Enable boringssl support" OFF)
option(tensorflow_ENABLE_GRPC_SUPPORT "Enable gRPC support" ON)
option(tensorflow_ENABLE_HDFS_SUPPORT "Enable HDFS support" OFF)
@@ -34,6 +33,12 @@
option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON)
option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions")
option(tensorflow_ENABLE_SNAPPY_SUPPORT "Enable SNAPPY compression support" ON)
+
+# GPU, CUDA and cuDNN options
+option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
+option(tensorflow_CUDA_VERSION "CUDA version to build against" 9.0)
+option(tensorflow_CUDNN_VERSION "cuDNN version to build against" 7)
+
if(HAIKU)
option(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE "Enable PIE support" OFF)
else()
@@ -262,7 +267,7 @@
list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}/stubs")
endif (NOT WIN32)
- find_package(CUDA 8.0 REQUIRED)
+ find_package(CUDA ${tensorflow_CUDA_VERSION} REQUIRED)
# by default we assume compute capability 3.5 and 5.2. If you change this, change it in
# CUDA_NVCC_FLAGS and cuda_config.h below
@@ -316,13 +321,16 @@
${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${cudnn_STATIC_LIBRARY} ${culibos_STATIC_LIBRARY} ${nccl_STATIC_LIBRARY})
endif (WIN32)
+ # Remove "." from CUDA version variable.
+ string(REPLACE "." "" short_CUDA_VER ${tensorflow_CUDA_VERSION})
+
# create cuda_config.h
FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
"#ifndef CUDA_CUDA_CONFIG_H_\n"
"#define CUDA_CUDA_CONFIG_H_\n"
"#define TF_CUDA_CAPABILITIES CudaVersion(\"3.0\"),CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
- "#define TF_CUDA_VERSION \"64_80\"\n"
- "#define TF_CUDNN_VERSION \"64_6\"\n"
+ "#define TF_CUDA_VERSION \"64_${short_CUDA_VER}\"\n"
+ "#define TF_CUDNN_VERSION \"64_${tensorflow_CUDNN_VERSION}\"\n"
"#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n"
"#endif // CUDA_CUDA_CONFIG_H_\n"
)
@@ -360,15 +368,15 @@
if(WIN32)
set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
msvcp_dll_name=msvcp140.dll
- cudart_dll_name=cudart64_80.dll
- cuda_version_number=8.0
+ cudart_dll_name=cudart64_${short_CUDA_VER}.dll
+ cuda_version_number=${tensorflow_CUDA_VERSION}
nvcuda_dll_name=nvcuda.dll
- cudnn_dll_name=cudnn64_6.dll
- cudnn_version_number=6)
+ cudnn_dll_name=cudnn64_${tensorflow_CUDNN_VERSION}.dll
+ cudnn_version_number=${tensorflow_CUDNN_VERSION})
else(WIN32)
set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
- cuda_version_number=8.0
- cudnn_version_number=6)
+ cuda_version_number=${tensorflow_CUDA_VERSION}
+ cudnn_version_number=${tensorflow_CUDNN_VERSION})
endif(WIN32)
else(tensorflow_ENABLE_GPU)
set(tensorflow_BUILD_INFO_FLAGS --build_config cpu --key_value
@@ -383,11 +391,7 @@
# Let's get to work!
include(tf_core_framework.cmake)
-# NOTE: Disabled until issue #3996 is fixed.
-# include(tf_stream_executor.cmake)
-if (tensorflow_ENABLE_GPU)
- include(tf_stream_executor.cmake)
-endif()
+include(tf_stream_executor.cmake)
include(tf_core_cpu.cmake)
include(tf_core_ops.cmake)
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index 4ddfec5..4be733a 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -19,23 +19,6 @@
### Current known limitations
* It is not possible to load a custom Op library.
* GCS file system is not supported.
-* The following Ops are not currently implemented:
- - Dequantize
- - QuantizeAndDequantize
- - QuantizedAvgPool
- - QuantizedBatchNomWithGlobalNormalization
- - QuantizedBiasAdd
- - QuantizedConcat
- - QuantizedConv2D
- - QuantizedMatmul
- - QuantizedMaxPoo
- - QuantizeDownAndShrinkRange
- - QuantizedRelu
- - QuantizedRelu6
- - QuantizedReshape
- - QuantizeV2
- - RequantizationRange
- - Requantize
## Building with CMake
diff --git a/tensorflow/contrib/cmake/external/gemmlowp.cmake b/tensorflow/contrib/cmake/external/gemmlowp.cmake
index 3b14665..a235442 100644
--- a/tensorflow/contrib/cmake/external/gemmlowp.cmake
+++ b/tensorflow/contrib/cmake/external/gemmlowp.cmake
@@ -14,8 +14,8 @@
# ==============================================================================
include (ExternalProject)
-set(gemmlowp_URL https://mirror.bazel.build/github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.zip)
-set(gemmlowp_HASH SHA256=dd2557072bde12141419cb8320a9c25e6ec41a8ae53c2ac78c076a347bb46d9d)
+set(gemmlowp_URL https://github.com/google/gemmlowp/archive/6a2a90822e8546fc2bfa7044de0faf1c1cb4862f.zip)
+set(gemmlowp_HASH SHA256=3447948d219f3270383766bbe08942888c0eb4e0ca6663c0e0548502ec5bb77d)
set(gemmlowp_BUILD ${CMAKE_CURRENT_BINARY_DIR}/gemmlowp/src/gemmlowp)
set(gemmlowp_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/gemmlowp/src/gemmlowp)
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index 2d01590..eb6bf56 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -150,9 +150,6 @@
if(WIN32)
file(GLOB_RECURSE tf_core_kernels_windows_exclude_srcs
# not working on windows yet
- "${tensorflow_source_dir}/tensorflow/core/kernels/meta_support.*"
- "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h"
- "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/neon/*"
# not in core - those are loaded dynamically as dll
"${tensorflow_source_dir}/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc"
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index 9ed5b4b..94ca4b0 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -146,6 +146,8 @@
"${tensorflow_source_dir}/tensorflow/contrib/estimator/python/estimator/*_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/*.py"
"${tensorflow_source_dir}/tensorflow/python/meta_graph_transform/*_test.py"
+ "${tensorflow_source_dir}/tensorflow/python/ops/quantized_conv_ops_test.py"
+ "${tensorflow_source_dir}/tensorflow/python/ops/quantized_ops_test.py"
"${tensorflow_source_dir}/tensorflow/python/platform/build_info_test.py"
"${tensorflow_source_dir}/tensorflow/python/profiler/*_test.py"
"${tensorflow_source_dir}/tensorflow/python/profiler/internal/*_test.py"
diff --git a/tensorflow/contrib/factorization/python/ops/gmm.py b/tensorflow/contrib/factorization/python/ops/gmm.py
index 0d67e09..f72280c 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm.py
@@ -24,7 +24,7 @@
from tensorflow.contrib import framework
from tensorflow.contrib.factorization.python.ops import gmm_ops
from tensorflow.contrib.framework.python.framework import checkpoint_utils
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
from tensorflow.contrib.learn.python.learn.estimators import estimator
from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
from tensorflow.python.framework import constant_op
@@ -167,7 +167,7 @@
self._num_clusters, self._random_seed,
self._covariance_type,
self._params)
- incr_step = state_ops.assign_add(variables.get_global_step(), 1)
+ incr_step = state_ops.assign_add(training_util.get_global_step(), 1)
loss = math_ops.reduce_sum(losses)
training_op = with_dependencies([training_op, incr_step], loss)
training_hooks = [_InitializeClustersHook(
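
This is the first of many hunks in this change that swap contrib's `variables.get_global_step` for `training_util.get_global_step`; the behavior is unchanged, only the import moves out of contrib. In user code, the rough public-API equivalent (TF 1.x) is:

```python
# Hedged sketch using the public TF 1.x API: fetch (or create) the global
# step and increment it as part of the training op.
import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
incr_step = tf.assign_add(global_step, 1)
```
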
diff --git a/tensorflow/contrib/ffmpeg/__init__.py b/tensorflow/contrib/ffmpeg/__init__.py
index 484ffee..daba965 100644
--- a/tensorflow/contrib/ffmpeg/__init__.py
+++ b/tensorflow/contrib/ffmpeg/__init__.py
@@ -28,6 +28,7 @@
from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_audio
from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video
from tensorflow.contrib.ffmpeg.ffmpeg_ops import encode_audio
+from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video
from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
index 1245f51..1e8af14 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
@@ -49,7 +49,8 @@
"-nostdin", // No interactive commands accepted.
"-f", input_format_id, // eg: "mp3"
"-probesize", StrCat(kDefaultProbeSize), "-i", input_filename,
- "-loglevel", "info", // Enable verbose logging to support debugging.
+ "-loglevel", "error", // Print errors only.
+ "-hide_banner", // Skip printing build options, version, etc.
"-map_metadata", "-1", // Copy global metadata from input to output.
"-vn", // No video recording.
"-ac:a:0", StrCat(channel_count), "-ar:a:0",
@@ -72,7 +73,8 @@
"-probesize",
StrCat(kDefaultProbeSize),
"-loglevel",
- "info", // Enable verbose logging to support debugging.
+ "error", // Print errors only.
+ "-hide_banner", // Skip printing build options, version, etc.
"-vcodec",
"rawvideo",
"-pix_fmt",
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
index d6c885a..36fc717 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
@@ -20,6 +20,8 @@
#include <string>
#include <vector>
+
+#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/env.h"
diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
index 5bb011f..08b5a6e 100644
--- a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
+++ b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
@@ -21,6 +21,7 @@
from tensorflow.contrib.ffmpeg.ops import gen_decode_audio_op_py
from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py
from tensorflow.contrib.ffmpeg.ops import gen_encode_audio_op_py
+from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py
from tensorflow.contrib.util import loader
from tensorflow.python.framework import ops
from tensorflow.python.platform import resource_loader
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index eef66af..d3dca3d 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -107,6 +107,7 @@
discriminator_loss_fn=None,
generator_optimizer=None,
discriminator_optimizer=None,
+ get_hooks_fn=None,
add_summaries=None,
use_loss_summaries=True,
config=None):
@@ -137,6 +138,10 @@
work.
discriminator_optimizer: Same as `generator_optimizer`, but for the
discriminator updates.
+ get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a
+ list of hooks. These hooks are run on the generator and discriminator
+ train ops, and can be used to implement the GAN training scheme.
+ Defaults to `train.get_sequential_train_hooks()`.
add_summaries: `None`, a single `SummaryType`, or a list of `SummaryType`.
use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
If `None`, uses defaults.
@@ -151,7 +156,7 @@
else discriminator_optimizer)
gan_head = head_lib.gan_head(
generator_loss_fn, discriminator_loss_fn, gopt, dopt,
- use_loss_summaries)
+ use_loss_summaries, get_hooks_fn=get_hooks_fn)
return _gan_model_fn(
features, labels, mode, generator_fn, discriminator_fn, gan_head,
add_summaries)
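
The new `get_hooks_fn` argument lets callers control how the generator and discriminator train ops are run. A hedged usage sketch, assuming the `tf.contrib.gan` symbols of this era; the toy generator and discriminator are placeholders:

```python
import tensorflow as tf
tfgan = tf.contrib.gan

def my_generator_fn(noise):
  # Toy generator: map noise to flattened 28x28 images.
  return tf.layers.dense(noise, 28 * 28)

def my_discriminator_fn(data, generator_inputs):
  # Toy discriminator: single logit per example.
  return tf.layers.dense(data, 1)

gan_estimator = tfgan.estimator.GANEstimator(
    generator_fn=my_generator_fn,
    discriminator_fn=my_discriminator_fn,
    generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
    discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss,
    generator_optimizer=tf.train.AdamOptimizer(1e-4),
    discriminator_optimizer=tf.train.AdamOptimizer(1e-4),
    # Run generator and discriminator train ops jointly instead of the
    # default sequential hooks.
    get_hooks_fn=tfgan.get_joint_train_hooks())
```
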
diff --git a/tensorflow/contrib/gan/python/estimator/python/head_impl.py b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
index 204c646..a21358c 100644
--- a/tensorflow/contrib/gan/python/estimator/python/head_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
@@ -71,7 +71,7 @@
def __init__(self, generator_loss_fn, discriminator_loss_fn,
generator_optimizer, discriminator_optimizer,
use_loss_summaries=True,
- get_hooks_fn=tfgan_train.get_sequential_train_hooks(),
+ get_hooks_fn=None,
name=None):
"""`Head` for GAN training.
@@ -86,10 +86,12 @@
use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
If `None`, uses defaults.
get_hooks_fn: A function that takes a GANTrainOps tuple and returns a list
- of hooks.
+        of hooks. Defaults to `train.get_sequential_train_hooks()`.
name: name of the head. If provided, summary and metrics keys will be
suffixed by `"/" + name`.
"""
+ if get_hooks_fn is None:
+ get_hooks_fn = tfgan_train.get_sequential_train_hooks()
# TODO(joelshor): Validate inputs.
if use_loss_summaries in [True, False]:
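
The change above is the standard Python idiom for avoiding a call in a default argument: default to `None` and resolve it inside the function, so `tfgan_train.get_sequential_train_hooks()` is not invoked at function-definition time. A minimal sketch of the idiom with a hypothetical factory:

```python
# `make_default_hooks_fn` is a hypothetical stand-in for
# `tfgan_train.get_sequential_train_hooks()`.
def make_default_hooks_fn():
  return lambda train_ops: []

def build_head(get_hooks_fn=None):
  # Resolve the default at call time, not at import time.
  if get_hooks_fn is None:
    get_hooks_fn = make_default_hooks_fn()
  return get_hooks_fn
```
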
diff --git a/tensorflow/contrib/layers/python/layers/initializers.py b/tensorflow/contrib/layers/python/layers/initializers.py
index b12a882..51610f2 100644
--- a/tensorflow/contrib/layers/python/layers/initializers.py
+++ b/tensorflow/contrib/layers/python/layers/initializers.py
@@ -79,7 +79,8 @@
```
* To get [Delving Deep into Rectifiers](
- http://arxiv.org/pdf/1502.01852v1.pdf), use (Default):<br/>
+    http://arxiv.org/pdf/1502.01852v1.pdf) (also known as the "MSRA
+ initialization"), use (Default):<br/>
`factor=2.0 mode='FAN_IN' uniform=False`
* To get [Convolutional Architecture for Fast Feature Embedding](
http://arxiv.org/abs/1408.5093), use:<br/>
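
For reference, the MSRA/He configuration this docstring describes corresponds to the following usage of `variance_scaling_initializer` (a sketch against the TF 1.x contrib API):

```python
import tensorflow as tf

# factor=2.0, mode='FAN_IN', uniform=False are the documented defaults for
# MSRA/He initialization.
msra_init = tf.contrib.layers.variance_scaling_initializer(
    factor=2.0, mode='FAN_IN', uniform=False)
weights = tf.get_variable('weights', shape=[128, 64], initializer=msra_init)
```
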
diff --git a/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py b/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py
index 1475096..ef5e620 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py
@@ -18,7 +18,7 @@
from __future__ import division
from __future__ import print_function
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
from tensorflow.contrib.layers.python.layers import feature_column
from tensorflow.contrib.learn.python.learn.datasets import base
from tensorflow.contrib.learn.python.learn.estimators import composable_model
@@ -55,7 +55,7 @@
raise NotImplementedError
def _train_op_fn(loss):
- global_step = contrib_variables.get_global_step()
+ global_step = training_util.get_global_step()
assert global_step
train_step = model.get_train_step(loss)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index cb15ef2..c17b41c 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -23,7 +23,7 @@
from tensorflow.contrib import layers
from tensorflow.contrib.framework import deprecated
from tensorflow.contrib.framework import deprecated_arg_values
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
from tensorflow.contrib.layers.python.layers import feature_column
from tensorflow.contrib.layers.python.layers import optimizers
from tensorflow.contrib.learn.python.learn import metric_spec
@@ -189,7 +189,7 @@
"""Returns the op to optimize the loss."""
return optimizers.optimize_loss(
loss=loss,
- global_step=contrib_variables.get_global_step(),
+ global_step=training_util.get_global_step(),
learning_rate=_LEARNING_RATE,
optimizer=_get_optimizer(optimizer),
gradient_multipliers=(
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
index 248c6c7..9d7c1a0 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
@@ -23,7 +23,7 @@
import numpy as np
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
from tensorflow.contrib.layers.python.layers import optimizers
from tensorflow.contrib.learn.python.learn import metric_spec
from tensorflow.contrib.learn.python.learn import models
@@ -114,7 +114,7 @@
prediction, loss = (models.linear_regression_zero_init(features, labels))
train_op = optimizers.optimize_loss(
loss,
- variables.get_global_step(),
+ training_util.get_global_step(),
optimizer='Adagrad',
learning_rate=params['learning_rate'])
return prediction, loss, train_op
@@ -129,7 +129,7 @@
(_, features), = features.items()
prediction, loss = (models.linear_regression_zero_init(features, labels))
train_op = optimizers.optimize_loss(
- loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+ loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
return prediction, loss, train_op
@@ -139,7 +139,7 @@
model_fn.ModeKeys.INFER)
prediction, loss = (models.linear_regression_zero_init(features, labels))
train_op = optimizers.optimize_loss(
- loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+ loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
return model_fn.ModelFnOps(
mode=mode, predictions=prediction, loss=loss, train_op=train_op)
@@ -150,7 +150,7 @@
labels = array_ops.one_hot(labels, 3, 1, 0)
prediction, loss = (models.logistic_regression_zero_init(features, labels))
train_op = optimizers.optimize_loss(
- loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+ loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
return {
'class': math_ops.argmax(prediction, 1),
'prob': prediction
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index be2b0cb..2a13a84 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -32,7 +32,7 @@
from tensorflow.contrib import learn
from tensorflow.contrib import lookup
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
from tensorflow.contrib.layers.python.layers import feature_column as feature_column_lib
from tensorflow.contrib.layers.python.layers import optimizers
from tensorflow.contrib.learn.python.learn import experiment
@@ -132,7 +132,7 @@
prediction, loss = (models.linear_regression_zero_init(features, labels))
train_op = optimizers.optimize_loss(
loss,
- variables.get_global_step(),
+ training_util.get_global_step(),
optimizer='Adagrad',
learning_rate=params['learning_rate'])
return prediction, loss, train_op
@@ -147,7 +147,7 @@
(_, features), = features.items()
prediction, loss = (models.linear_regression_zero_init(features, labels))
train_op = optimizers.optimize_loss(
- loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+ loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
return prediction, loss, train_op
@@ -157,7 +157,7 @@
model_fn.ModeKeys.INFER)
prediction, loss = (models.linear_regression_zero_init(features, labels))
train_op = optimizers.optimize_loss(
- loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+ loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
return model_fn.ModelFnOps(
mode=mode, predictions=prediction, loss=loss, train_op=train_op)
@@ -168,7 +168,7 @@
labels = array_ops.one_hot(labels, 3, 1, 0)
prediction, loss = (models.logistic_regression_zero_init(features, labels))
train_op = optimizers.optimize_loss(
- loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+ loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
return {
'class': math_ops.argmax(prediction, 1),
'prob': prediction
@@ -241,7 +241,7 @@
const = constant_op.constant(-1, dtype=dtypes.int64)
table = lookup.MutableHashTable(
dtypes.string, dtypes.int64, const, name='LookupTableModel')
- update_global_step = variables.get_global_step().assign_add(1)
+ update_global_step = training_util.get_global_step().assign_add(1)
if mode in (model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL):
key = constant_op.constant(['key'])
value = constant_op.constant([42], dtype=dtypes.int64)
@@ -306,7 +306,7 @@
mode=mode,
predictions=constant_op.constant(0.),
loss=constant_op.constant(0.),
- train_op=variables.get_global_step().assign_add(1))
+ train_op=training_util.get_global_step().assign_add(1))
def _make_input_fn(features, labels):
@@ -389,7 +389,7 @@
self.assertEqual(expected_param, params)
self.assertEqual(model_dir, expected_model_dir)
return (constant_op.constant(0.), constant_op.constant(0.),
- variables.get_global_step().assign_add(1))
+ training_util.get_global_step().assign_add(1))
est = estimator.Estimator(model_fn=_argument_checker,
params=expected_param,
model_dir=expected_model_dir)
@@ -400,7 +400,7 @@
def _invalid_model_fn(features, labels):
# pylint: disable=unused-argument
w = variables_lib.Variable(42.0, 'weight')
- update_global_step = variables.get_global_step().assign_add(1)
+ update_global_step = training_util.get_global_step().assign_add(1)
with ops.control_dependencies([update_global_step]):
loss = 100.0 - w
return None, loss, None
@@ -415,7 +415,7 @@
# pylint: disable=unused-argument
w = variables_lib.Variable(42.0, 'weight')
loss = 100.0 - w
- update_global_step = variables.get_global_step().assign_add(1)
+ update_global_step = training_util.get_global_step().assign_add(1)
with ops.control_dependencies([update_global_step]):
train_op = w.assign_add(loss / 100.0)
predictions = loss
@@ -434,7 +434,7 @@
# pylint: disable=unused-argument
w = variables_lib.Variable(42.0, 'weight')
loss = 100.0 - w
- update_global_step = variables.get_global_step().assign_add(1)
+ update_global_step = training_util.get_global_step().assign_add(1)
with ops.control_dependencies([update_global_step]):
train_op = w.assign_add(loss / 100.0)
return None, loss, train_op
@@ -464,7 +464,7 @@
mode=mode,
predictions=constant_op.constant(0.),
loss=constant_op.constant(0.),
- train_op=variables.get_global_step().assign_add(1),
+ train_op=training_util.get_global_step().assign_add(1),
scaffold=monitored_session.Scaffold(init_fn=_init_fn))
est = estimator.Estimator(model_fn=_model_fn_scaffold)
@@ -483,7 +483,7 @@
mode=mode,
predictions=constant_op.constant([[1.]]),
loss=constant_op.constant(0.),
- train_op=variables.get_global_step().assign_add(1),
+ train_op=training_util.get_global_step().assign_add(1),
scaffold=monitored_session.Scaffold(saver=self.mock_saver))
def input_fn():
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
index 1d89dfb..8131e0f 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
@@ -22,7 +22,7 @@
import numpy as np
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
from tensorflow.contrib.learn.python import learn
from tensorflow.contrib.learn.python.learn import datasets
from tensorflow.contrib.learn.python.learn import metric_spec
@@ -62,7 +62,7 @@
_ = labels
predictions = features["transformed_x"]
loss = constant_op.constant([2.])
- update_global_step = variables.get_global_step().assign_add(1)
+ update_global_step = training_util.get_global_step().assign_add(1)
return predictions, loss, update_global_step
estimator = estimator_lib.Estimator(
@@ -100,7 +100,7 @@
_ = labels
predictions = features["x"]
loss = constant_op.constant([2.])
- update_global_step = variables.get_global_step().assign_add(1)
+ update_global_step = training_util.get_global_step().assign_add(1)
return predictions, loss, update_global_step
estimator = estimator_lib.Estimator(
@@ -139,7 +139,7 @@
_ = labels
predictions = features["x"]
loss = constant_op.constant([2.])
- update_global_step = variables.get_global_step().assign_add(1)
+ update_global_step = training_util.get_global_step().assign_add(1)
return predictions, loss, update_global_step
estimator_with_fe_fn = estimator_lib.Estimator(
diff --git a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
index 992b804..8f9d6fc 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
@@ -28,7 +28,7 @@
import numpy as np
from tensorflow.contrib.factorization.python.ops import clustering_ops
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
from tensorflow.contrib.learn.python.learn.estimators import estimator
from tensorflow.contrib.learn.python.learn.estimators.model_fn import ModelFnOps
from tensorflow.python.framework import ops
@@ -128,7 +128,7 @@
random_seed=params.get('random_seed'),
kmeans_plus_plus_num_retries=params.get(
'kmeans_plus_plus_num_retries')).training_graph()
- incr_step = state_ops.assign_add(variables.get_global_step(), 1)
+ incr_step = state_ops.assign_add(training_util.get_global_step(), 1)
loss = math_ops.reduce_sum(losses, name=KMeansClustering.LOSS_OP_NAME)
summary.scalar('loss/raw', loss)
training_op = with_dependencies([training_op, incr_step], loss)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py
index f5445ad..37aa8b3 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@@ -26,7 +26,7 @@
from tensorflow.contrib import layers
from tensorflow.contrib.framework import deprecated
from tensorflow.contrib.framework import deprecated_arg_values
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
from tensorflow.contrib.layers.python.layers import feature_column
from tensorflow.contrib.learn.python.learn.estimators import estimator
from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
@@ -170,7 +170,7 @@
weight_collections=[parent_scope])
def _train_op_fn(loss):
- global_step = contrib_variables.get_global_step()
+ global_step = training_util.get_global_step()
my_vars = ops.get_collection(parent_scope)
grads = gradients.gradients(loss, my_vars)
if gradient_clip_norm:
@@ -252,7 +252,7 @@
_add_bias_column(feature_columns, features, bias, columns_to_variables)
def _train_op_fn(unused_loss):
- global_step = contrib_variables.get_global_step()
+ global_step = training_util.get_global_step()
sdca_model, train_op = optimizer.get_train_step(columns_to_variables,
weight_column_name,
loss_type, features,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
index 93c62f8..656d68b 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
@@ -21,7 +21,7 @@
import numpy as np
from tensorflow.contrib import layers
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
from tensorflow.contrib.layers.python.layers import optimizers
from tensorflow.contrib.learn.python.learn.datasets import base
from tensorflow.contrib.learn.python.learn.estimators import logistic_regressor
@@ -57,7 +57,7 @@
predictions = math_ops.sigmoid(logits)
loss = losses.sigmoid_cross_entropy(labels, logits)
train_op = optimizers.optimize_loss(
- loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+ loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
return predictions, loss, train_op
diff --git a/tensorflow/contrib/learn/python/learn/utils/export.py b/tensorflow/contrib/learn/python/learn/utils/export.py
index 6af2287..cb34cb1 100644
--- a/tensorflow/contrib/learn/python/learn/utils/export.py
+++ b/tensorflow/contrib/learn/python/learn/utils/export.py
@@ -20,7 +20,7 @@
from __future__ import print_function
from tensorflow.contrib.framework import deprecated
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
from tensorflow.contrib.session_bundle import exporter
from tensorflow.contrib.session_bundle import gc
from tensorflow.python.client import session as tf_session
@@ -78,7 +78,7 @@
default_graph_signature=default_graph_signature,
named_graph_signatures=named_graph_signatures,
assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS))
- return export.export(export_dir, contrib_variables.get_global_step(),
+ return export.export(export_dir, training_util.get_global_step(),
session, exports_to_keep=exports_to_keep)
@@ -295,7 +295,7 @@
checkpoint_path = (checkpoint_path or
tf_saver.latest_checkpoint(estimator._model_dir))
with ops.Graph().as_default() as g:
- contrib_variables.create_global_step(g)
+ training_util.create_global_step(g)
if use_deprecated_input_fn:
examples = array_ops.placeholder(dtype=dtypes.string,
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
index 701fc1c..05794a4 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
@@ -19,7 +19,7 @@
from __future__ import print_function
from tensorflow.contrib import layers
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
from tensorflow.contrib.learn.python.learn.estimators import estimator
from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
from tensorflow.contrib.learn.python.learn.estimators import prediction_key
@@ -154,7 +154,7 @@
_add_bias_column(feature_columns, features, bias, columns_to_variables)
def _train_op_fn(unused_loss):
- global_step = contrib_variables.get_global_step()
+ global_step = training_util.get_global_step()
sdca_model, train_op = optimizer.get_train_step(
columns_to_variables, weight_column_name, loss_type, features, labels,
global_step)
diff --git a/tensorflow/contrib/lite/README.md b/tensorflow/contrib/lite/README.md
index fc9144d..2fb4007 100644
--- a/tensorflow/contrib/lite/README.md
+++ b/tensorflow/contrib/lite/README.md
@@ -167,6 +167,7 @@
This frozen Graphdef is now ready to be converted to flatbuffer format (.lite) for use on Android or iOS. On Android users have the flexibility to use either the float or quantized versions of the frozen graphdef, if available, using the Tensorflow Optimizing Converter tool.
Here is a sample command line to convert the frozen Graphdef to the '.lite' format. The Tensorflow Optimizing Converter supports both float and quantized models; however, different configuration parameters are needed depending on whether a FLOAT or QUANTIZED mode is being used.
+(Here is a link to the pb [file](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)).
```
bazel build tensorflow/contrib/lite/toco:toco
@@ -215,3 +216,7 @@
### For iOS
Follow the documentation [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/ios.md) to integrate a TFLite model into your app.
+
+## Core ML support
+
+Core ML is a machine learning framework used across Apple products. In addition to using Tensorflow Lite models directly in their applications, developers have the option to convert their trained Tensorflow models to the [CoreML](https://developer.apple.com/machine-learning/) format for use on Apple devices. For information on how to use the converter please refer to the [Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
diff --git a/tensorflow/contrib/lite/ios_makefile.inc b/tensorflow/contrib/lite/ios_makefile.inc
index 345ed26..bcff7ed 100644
--- a/tensorflow/contrib/lite/ios_makefile.inc
+++ b/tensorflow/contrib/lite/ios_makefile.inc
@@ -1,31 +1,47 @@
-#Settings for iOS.
-ifeq($(TARGET), IOS) BUILD_FOR_IOS_SIMULATOR
- : = false ifeq($(IOS_ARCH), x86_64) BUILD_FOR_IOS_SIMULATOR
- : = true endif ifeq($(IOS_ARCH), i386) BUILD_FOR_IOS_SIMULATOR
- : = true endif ifeq($(BUILD_FOR_IOS_SIMULATOR), true) IPHONEOS_PLATFORM
- : = $(shell xcrun-- sdk iphonesimulator-- show - sdk - platform -
- path) IPHONEOS_SYSROOT
- : = $(shell xcrun-- sdk iphonesimulator-- show - sdk -
- path) else IPHONEOS_PLATFORM
- : = $(shell xcrun-- sdk iphoneos-- show - sdk - platform -
- path) IPHONEOS_SYSROOT
- : = $(shell xcrun-- sdk iphoneos-- show - sdk - path) endif IOS_SDK_VERSION
- : = $(shell xcrun-- sdk iphoneos-- show - sdk - version) MIN_SDK_VERSION
- : = 9.0
-#Override IOS_ARCH with armv7, armv7s, arm64, i386, or x86_64.
- IOS_ARCH
- : = x86_64 CXXFLAGS
- += -miphoneos - version
- - min = $(MIN_SDK_VERSION) - DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
- - fembed - bitcode - Wno - c++ 11 - narrowing - mno - thumb
- - fno - exceptions
- - isysroot ${IPHONEOS_SYSROOT} - arch $(IOS_ARCH) - O3 CCFLAGS
- += -miphoneos - version
- - min = $(MIN_SDK_VERSION) - fembed - bitcode - mno - thumb
- - isysroot ${IPHONEOS_SYSROOT} - arch $(IOS_ARCH) -
- O3 LDFLAGS
- : = -fembed - bitcode - miphoneos - version
- - min = ${MIN_SDK_VERSION} - arch $(IOS_ARCH) OBJDIR
- : = $(OBJDIR) ios_$(IOS_ARCH) / LIBDIR
- : = $(LIBDIR) ios_$(IOS_ARCH) / BINDIR
- : = $(BINDIR) ios_$(IOS_ARCH) / DEPDIR : = $(DEPDIR) ios_$(IOS_ARCH) / endif
+# Settings for iOS.
+ifeq ($(TARGET), IOS)
+ BUILD_FOR_IOS_SIMULATOR := false
+ ifeq ($(IOS_ARCH), x86_64)
+ BUILD_FOR_IOS_SIMULATOR := true
+ endif
+ ifeq ($(IOS_ARCH), i386)
+ BUILD_FOR_IOS_SIMULATOR := true
+ endif
+ ifeq ($(BUILD_FOR_IOS_SIMULATOR), true)
+ IPHONEOS_PLATFORM := $(shell xcrun --sdk iphonesimulator \
+ --show-sdk-platform-path)
+ IPHONEOS_SYSROOT := $(shell xcrun --sdk iphonesimulator \
+ --show-sdk-path)
+ else
+ IPHONEOS_PLATFORM := $(shell xcrun --sdk iphoneos --show-sdk-platform-path)
+ IPHONEOS_SYSROOT := $(shell xcrun --sdk iphoneos --show-sdk-path)
+ endif
+ IOS_SDK_VERSION := $(shell xcrun --sdk iphoneos --show-sdk-version)
+ MIN_SDK_VERSION := 9.0
+ # Override IOS_ARCH with armv7, armv7s, arm64, i386, or x86_64.
+ IOS_ARCH := x86_64
+ CXXFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \
+ -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
+ -fembed-bitcode \
+ -Wno-c++11-narrowing \
+ -mno-thumb \
+ -fno-exceptions \
+ -isysroot \
+ ${IPHONEOS_SYSROOT} \
+ -arch $(IOS_ARCH) \
+ -O3
+ CCFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \
+ -fembed-bitcode \
+ -mno-thumb \
+ -isysroot \
+ ${IPHONEOS_SYSROOT} \
+ -arch $(IOS_ARCH) \
+ -O3
+ LDFLAGS := -fembed-bitcode \
+ -miphoneos-version-min=${MIN_SDK_VERSION} \
+ -arch $(IOS_ARCH)
+ OBJDIR := $(OBJDIR)ios_$(IOS_ARCH)/
+ LIBDIR := $(LIBDIR)ios_$(IOS_ARCH)/
+ BINDIR := $(BINDIR)ios_$(IOS_ARCH)/
+ DEPDIR := $(DEPDIR)ios_$(IOS_ARCH)/
+endif
diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index bdb5e01..8066889 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -108,7 +108,7 @@
* The type of operations that can be added to a model.
*/
enum {
- /** Adds two tensors, elment-wise.
+ /** Adds two tensors, element-wise.
*
* Takes two input tensors of identical type and compatible dimensions. The
* output is the sum of both input tensors, optionally modified by an
@@ -743,7 +743,7 @@
*/
ANEURALNETWORKS_MAX_POOL_2D = 17,
- /** Multiplies two tensors, elment-wise.
+ /** Multiplies two tensors, element-wise.
*
* Takes two input tensors of identical type and compatible dimensions. The
* output is the product of both input tensors, optionally modified by an
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index e2e6c05..ee84b5b 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -300,7 +300,7 @@
ifeq ($(ANDROID_ARCH),x86_64)
TOOLCHAIN := x86_64-4.9
SYSROOT_ARCH := x86_64
- BIN_PREFIX := x86-64-linux-android
+ BIN_PREFIX := x86_64-linux-android
MARCH_OPTION :=
endif
diff --git a/tensorflow/contrib/nn/__init__.py b/tensorflow/contrib/nn/__init__.py
index 0bc133a..96d60e1 100644
--- a/tensorflow/contrib/nn/__init__.py
+++ b/tensorflow/contrib/nn/__init__.py
@@ -21,6 +21,7 @@
@@deprecated_flipped_sigmoid_cross_entropy_with_logits
@@nth_element
@@rank_sampled_softmax_loss
+@@sampled_sparse_softmax_loss
@@scaled_softplus
"""
diff --git a/tensorflow/contrib/nn/python/ops/sampling_ops.py b/tensorflow/contrib/nn/python/ops/sampling_ops.py
index 98749cf..63fc487 100644
--- a/tensorflow/contrib/nn/python/ops/sampling_ops.py
+++ b/tensorflow/contrib/nn/python/ops/sampling_ops.py
@@ -24,6 +24,8 @@
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import nn_ops
def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
@@ -240,3 +242,101 @@
remove_accidental_hits=remove_accidental_hits,
partition_strategy=partition_strategy,
name=name)
+
+
+def sampled_sparse_softmax_loss(weights,
+ biases,
+ labels,
+ inputs,
+ num_sampled,
+ num_classes,
+ sampled_values=None,
+ remove_accidental_hits=True,
+ partition_strategy="mod",
+ name="sampled_sparse_softmax_loss"):
+ """Computes and returns the sampled sparse softmax training loss.
+
+ This is a faster way to train a softmax classifier over a huge number of
+ classes.
+
+ This operation is for training only. It is generally an underestimate of
+ the full softmax loss.
+
+ A common use case is to use this method for training, and calculate the full
+ softmax loss for evaluation or inference. In this case, you must set
+ `partition_strategy="div"` for the two losses to be consistent, as in the
+ following example:
+
+ ```python
+ if mode == "train":
+ loss = tf.nn.sampled_sparse_softmax_loss(
+ weights=weights,
+ biases=biases,
+ labels=labels,
+ inputs=inputs,
+ ...,
+ partition_strategy="div")
+ elif mode == "eval":
+ logits = tf.matmul(inputs, tf.transpose(weights))
+ logits = tf.nn.bias_add(logits, biases)
+ loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+ labels=tf.squeeze(labels),
+ logits=logits)
+ ```
+
+ See our [Candidate Sampling Algorithms Reference]
+ (https://www.tensorflow.org/extras/candidate_sampling.pdf)
+
+ Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
+ ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.
+
+ Args:
+ weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
+ objects whose concatenation along dimension 0 has shape
+ [num_classes, dim]. The (possibly-sharded) class embeddings.
+ biases: A `Tensor` of shape `[num_classes]`. The class biases.
+ labels: A `Tensor` of type `int64` and shape `[batch_size, 1]`.
+ The index of the single target class for each row of logits. Note that
+ this format differs from the `labels` argument of
+ `nn.sparse_softmax_cross_entropy_with_logits`.
+ inputs: A `Tensor` of shape `[batch_size, dim]`. The forward
+ activations of the input network.
+ num_sampled: An `int`. The number of classes to randomly sample per batch.
+ num_classes: An `int`. The number of possible classes.
+ sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
+ `sampled_expected_count`) returned by a `*_candidate_sampler` function.
+ (if None, we default to `log_uniform_candidate_sampler`)
+    remove_accidental_hits: A `bool`. Whether to remove "accidental hits"
+ where a sampled class equals one of the target classes. Default is
+ True.
+ partition_strategy: A string specifying the partitioning strategy, relevant
+ if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
+ Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
+ name: A name for the operation (optional).
+
+ Returns:
+ A `batch_size` 1-D tensor of per-example sampled softmax losses.
+
+ """
+ logits, _ = nn_impl._compute_sampled_logits(
+ weights=weights,
+ biases=biases,
+ labels=labels,
+ inputs=inputs,
+ num_sampled=num_sampled,
+ num_classes=num_classes,
+ num_true=1,
+ sampled_values=sampled_values,
+ subtract_log_q=True,
+ remove_accidental_hits=remove_accidental_hits,
+ partition_strategy=partition_strategy,
+ name=name)
+
+ # There is only one true label. _compute_sampled_logits puts the true logit
+ # at index 0.
+ labels = array_ops.zeros([array_ops.shape(logits)[0], 1], dtype=dtypes.int64)
+
+ sampled_losses = nn_ops.sparse_softmax_cross_entropy_with_logits(
+ labels=array_ops.squeeze(labels), logits=logits)
+ # sampled_losses is a [batch_size] tensor.
+ return sampled_losses
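
A hedged usage sketch of the new `sampled_sparse_softmax_loss`, exposed through `tf.contrib.nn` by the `__init__.py` change above; shapes and variable names are illustrative:

```python
import tensorflow as tf

num_classes, dim, batch_size, num_sampled = 10000, 128, 32, 64
weights = tf.get_variable('softmax_w', [num_classes, dim])
biases = tf.get_variable('softmax_b', [num_classes])
inputs = tf.placeholder(tf.float32, [batch_size, dim])
# Note the [batch_size, 1] label shape required by this loss.
labels = tf.placeholder(tf.int64, [batch_size, 1])

loss = tf.reduce_mean(tf.contrib.nn.sampled_sparse_softmax_loss(
    weights=weights, biases=biases, labels=labels, inputs=inputs,
    num_sampled=num_sampled, num_classes=num_classes))
```
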
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index d2811f2..9c961f2 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -16,6 +16,7 @@
"__init__.py",
"python/training/addsign.py",
"python/training/drop_stale_gradient_optimizer.py",
+ "python/training/elastic_average_optimizer.py",
"python/training/external_optimizer.py",
"python/training/lazy_adam_optimizer.py",
"python/training/moving_average_optimizer.py",
@@ -174,6 +175,24 @@
],
)
+tf_py_test(
+ name = "elastic_average_optimizer_test",
+ srcs = ["python/training/elastic_average_optimizer_test.py"],
+ additional_deps = [
+ ":opt_py",
+ "//tensorflow/python:client",
+ "//tensorflow/python:client_testlib",
+ "//tensorflow/python:array_ops",
+ "//tensorflow/python:variables",
+ "//tensorflow/python:framework",
+ "//tensorflow/python:platform",
+ "//tensorflow/python:training",
+ "//tensorflow/python:ops",
+ "//tensorflow/python:framework_for_generated_wrappers",
+ "//third_party/py/numpy",
+ ],
+)
+
py_test(
name = "sign_decay_test",
srcs = ["python/training/sign_decay_test.py"],
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index 04643a6..2025e8b 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+ # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -28,6 +28,7 @@
from tensorflow.contrib.opt.python.training.nadam_optimizer import *
from tensorflow.contrib.opt.python.training.powersign import *
from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import *
+from tensorflow.contrib.opt.python.training.elastic_average_optimizer import *
# pylint: enable=wildcard-import
from tensorflow.python.util.all_util import remove_undocumented
@@ -46,6 +47,8 @@
'VariableClippingOptimizer',
'MultitaskOptimizerWrapper',
'clip_gradients_by_global_norm',
+ 'ElasticAverageOptimizer',
+ 'ElasticAverageCustomGetter'
]
remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
new file mode 100644
index 0000000..9941f22
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
@@ -0,0 +1,345 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Wrapper optimizer for Elastic Average SGD """
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+
+from tensorflow.python.ops import gen_nn_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import constant_op
+
+LOCAL_VARIABLE_NAME = 'local_center_variable'
+GLOBAL_VARIABLE_NAME = 'global_center_variable'
+
+
+class ElasticAverageCustomGetter(object):
+  """Custom getter class used to:
+  1. Move trainable variables to the local collection and place them on the
+    worker device.
+  2. Generate global variables (global center variables).
+  3. Generate local variables (local center variables) that mirror the global
+    variables and place them on the worker device.
+  Note that the class should be used with tf.replica_device_setter so that
+  the global center variables and the global step variable can be placed on
+  the ps device. In addition, use 'tf.get_variable' instead of 'tf.Variable'
+  when using this custom getter.
+
+ For example,
+ ea_custom_getter = ElasticAverageCustomGetter(worker_device)
+ with tf.device(
+ tf.train.replica_device_setter(
+ worker_device=worker_device,
+ ps_device="/job:ps/cpu:0",
+ cluster=cluster)),
+ tf.variable_scope('',custom_getter=ea_custom_getter):
+ hid_w = tf.get_variable(
+ initializer=tf.truncated_normal(
+ [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
+ stddev=1.0 / IMAGE_PIXELS),
+ name="hid_w")
+ hid_b = tf.get_variable(initializer=tf.zeros([FLAGS.hidden_units]),
+ name="hid_b")
+ """
+
+ def __init__(self, worker_device):
+ """Create a new `ElasticAverageCustomGetter`.
+
+ Args:
+ worker_device: String. Name of the `worker` job.
+ """
+ self._worker_device = worker_device
+ self._local_map = {}
+ self._global_map = {}
+
+ def __call__(self, getter, name, trainable, collections, *args, **kwargs):
+ if trainable:
+ with ops.device(self._worker_device):
+ local_var = getter(name, trainable=True,
+ collections=[ops.GraphKeys.LOCAL_VARIABLES],
+ *args, **kwargs)
+
+ global_center_variable = variable_scope.variable(
+ name='%s/%s' %
+ (GLOBAL_VARIABLE_NAME,
+ name),
+ initial_value=local_var.initialized_value(),
+ trainable=False,
+ collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+
+ with ops.device(self._worker_device):
+ local_center_variable = variable_scope.variable(
+ name='%s/%s' % (LOCAL_VARIABLE_NAME, name),
+ initial_value=local_var.initialized_value(),
+ trainable=False,
+ collections=[ops.GraphKeys.LOCAL_VARIABLES])
+
+ self._local_map[local_var] = local_center_variable
+ self._global_map[local_var] = global_center_variable
+ return local_var
+ else:
+ return getter(name, trainable, collections, *args, **kwargs)
+
+
+class ElasticAverageOptimizer(optimizer.Optimizer):
+ """Wrapper optimizer that implements the Elastic Average SGD algorithm.
+  This is an async optimizer. During training, each worker updates its local
+  variables and maintains its own local_step, which starts from 0 and is
+  incremented by 1 after each update of the local variables. Whenever the
+  communication period divides the local step, the worker requests the
+  current global center variables and computes the elastic difference
+  between the global center variables and the local variables. The elastic
+  difference is then used to update both the local and the global variables.
+ """
+
+ # Default value as paper described
+ BETA = 0.9
+
+ def __init__(
+ self,
+ opt,
+ num_worker,
+ ea_custom_getter,
+ communication_period=10,
+ moving_rate=None,
+ rho=None,
+ use_locking=True,
+ name="ElasticAverageOptimizer"):
+ """Construct a new gradient descent optimizer.
+
+ Args:
+ opt: The actual optimizer that will be used to update local variables.
+ Must be one of the Optimizer classes.
+      num_worker: The number of workers.
+      ea_custom_getter: The `ElasticAverageCustomGetter` used to create the
+        local and global center variables.
+      communication_period: An int value that controls the frequency of
+        communication between every worker and the ps.
+      moving_rate: A floating point value that controls the elastic
+        difference.
+      rho: The amount of exploration we allow in the model. The default
+        value is moving_rate/learning_rate.
+      use_locking: If True, use locks for update operations.
+ name: Optional name prefix for the operations created when applying
+ gradients. Defaults to "ElasticAverageOptimizer".
+ """
+ super(ElasticAverageOptimizer, self).__init__(use_locking, name)
+ self._opt = opt
+ self._num_worker = num_worker
+ self._period = communication_period
+ self._local_map = ea_custom_getter._local_map
+ self._global_map = ea_custom_getter._global_map
+
+ if moving_rate is None:
+      self._moving_rate = self.BETA / communication_period / num_worker
+ else:
+ self._moving_rate = moving_rate
+ if rho is None:
+ self._rho = self._moving_rate / self._opt._learning_rate
+ else:
+ self._rho = rho
+
+ self._local_step = variable_scope.get_variable(
+ initializer=0,
+ trainable=False,
+ collections=[ops.GraphKeys.LOCAL_VARIABLES],
+ name="local_step")
+ self._opt._prepare()
+
+ def compute_gradients(self, loss, var_list=None,
+ gate_gradients=optimizer.Optimizer.GATE_OP,
+ aggregation_method=None,
+ colocate_gradients_with_ops=False,
+ grad_loss=None):
+ """Compute gradients of `loss` for the variables in `var_list`.
+
+    Adds rho*elastic_difference to the loss to control exploration.
+ This is the first part of `minimize()`. It returns a list
+ of (gradient, variable) pairs where "gradient" is the gradient
+ for "variable". Note that "gradient" can be a `Tensor`, an
+ `IndexedSlices`, or `None` if there is no gradient for the
+ given variable.
+
+ Args:
+ loss: A Tensor containing the value to minimize.
+ var_list: Optional list or tuple of `tf.Variable` to update to minimize
+ `loss`. Defaults to the list of variables collected in the graph
+ under the key `GraphKey.TRAINABLE_VARIABLES`.
+ gate_gradients: How to gate the computation of gradients. Can be
+ `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
+ aggregation_method: Specifies the method used to combine gradient terms.
+ Valid values are defined in the class `AggregationMethod`.
+ colocate_gradients_with_ops: If True, try colocating gradients with
+ the corresponding op.
+ grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+
+ Returns:
+ A list of (gradient, variable) pairs. Variable is always present, but
+ gradient can be `None`.
+
+ Raises:
+      TypeError: If `var_list` contains anything other than `Variable` objects.
+ ValueError: If some arguments are invalid.
+ """
+ if not var_list:
+ var_list = variables.trainable_variables()
+
+ elastic_difference = [math_ops.subtract(v, lv) for v, lv in zip(
+ variables.trainable_variables(),
+ [self._local_map[var] for var in var_list])]
+
+ distance_loss = self._rho * math_ops.add_n(
+ [gen_nn_ops.l2_loss(ed) for ed in elastic_difference])
+
+ total_loss = loss + distance_loss
+ return self._opt.compute_gradients(total_loss, var_list,
+ gate_gradients, aggregation_method,
+ colocate_gradients_with_ops, grad_loss)
+
+ def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+ """Apply gradients to global variables.
+
+ This is the second part of `minimize()`. It returns an `Operation` that
+ applies gradients.
+
+ Args:
+ grads_and_vars: List of (gradient, variable) pairs as returned by
+ `compute_gradients()`.
+ global_step: Optional `Variable` to increment by one after the
+ variables have been updated.
+      name: Optional name for the returned operation. Defaults to the
+ name passed to the `Optimizer` constructor.
+
+ Returns:
+ An `Operation` that applies the specified gradients. If `global_step`
+ was not None, that operation also increments `global_step`.
+
+ Raises:
+ TypeError: If `grads_and_vars` is malformed.
+ ValueError: If none of the variables have gradients.
+ """
+ apply_updates = self._opt.apply_gradients(grads_and_vars)
+ with ops.control_dependencies([apply_updates]):
+ local_update = state_ops.assign_add(
+ self._local_step, 1, name='local_step_update').op
+
+    # Elastic update of the local variables and the global center variables.
+ def _Update_global_variables():
+ local_vars = [v for g, v in grads_and_vars if g is not None]
+ global_center_vars = [self._global_map[var] for var in local_vars]
+ local_center_vars = [self._local_map[var] for var in local_vars]
+ local_center_vars_update = []
+ for lvar, var in zip(local_center_vars, global_center_vars):
+ local_center_vars_update.append(lvar.assign(var))
+ update_ops = []
+ differences = []
+ with ops.control_dependencies(local_center_vars_update):
+ for v, lv in zip(local_vars, local_center_vars):
+ with ops.device(v.device):
+ differences.append(math_ops.subtract(v, lv))
+ for lvar, diff in zip(local_vars, differences):
+ with ops.device(lvar.device):
+ update_ops.append(state_ops.assign_sub(lvar, math_ops.multiply(
+ self._moving_rate, diff)))
+ for var, diff in zip(global_center_vars, differences):
+ with ops.device(var.device):
+ update_ops.append(state_ops.assign_add(var, math_ops.multiply(
+ self._moving_rate, diff)))
+ if global_step:
+ with ops.colocate_with(global_step):
+ update_ops.append(state_ops.assign_add(global_step, 1))
+ variable_update = control_flow_ops.group(*(update_ops))
+ return variable_update
+
+ with ops.control_dependencies([local_update]):
+ condition = math_ops.equal(math_ops.mod(
+ self._local_step, self._period), 0)
+ conditional_update = control_flow_ops.cond(
+ condition, _Update_global_variables, control_flow_ops.no_op)
+ return conditional_update
+
+ def get_init_op(self, task_index):
+ """Returns the op to let all the local variables and local center
+ variables equal to the global center variables before the training begins"""
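+    # Every worker runs this op; the FIFO-queue barrier built below makes each
+    # worker wait until all workers have copied the global center values, so
+    # no worker starts training from uninitialized local variables.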
+
+ def _Add_sync_queues_and_barrier(enqueue_after_list):
+ """Adds ops to enqueu on all worker queues"""
+      sync_queues = [
+          data_flow_ops.FIFOQueue(
+              self._num_worker, [dtypes.bool], shapes=[[]],
+              shared_name='variable_init_sync_queue%s' % i)
+          for i in range(self._num_worker)]
+ queue_ops = []
+      # For every other worker, add an entry to its queue.
+ token = constant_op.constant(False)
+ with ops.control_dependencies(enqueue_after_list):
+ for i, q in enumerate(sync_queues):
+ if i == task_index:
+ queue_ops.append(control_flow_ops.no_op())
+ else:
+ queue_ops.append(q.enqueue(token))
+ queue_ops.append(
+ sync_queues[task_index].dequeue_many(len(sync_queues) - 1))
+ return control_flow_ops.group(*queue_ops)
+
+ init_ops = []
+ local_vars = variables.trainable_variables()
+ global_center_vars = [self._global_map[var] for var in local_vars]
+ local_center_vars = [self._local_map[var] for var in local_vars]
+ if not (local_vars and global_center_vars and local_center_vars):
+ raise ValueError(
+        'The lists of local_variables, global_center_variables and '
+        'local_center_variables should not be empty.')
+ for lvar, gc_var, lc_var in zip(
+ local_vars, global_center_vars, local_center_vars):
+ init_ops.append(state_ops.assign(lvar, gc_var))
+ init_ops.append(state_ops.assign(lc_var, gc_var))
+
+ init_op = control_flow_ops.group(*(init_ops))
+ sync_queue_op = _Add_sync_queues_and_barrier([init_op])
+ return sync_queue_op
+
+ def make_session_run_hook(self, is_chief, task_index):
+ """Creates a hook to handle ElasticAverageOptimizerHook ops such as initialization."""
+ return _ElasticAverageOptimizerHook(self, is_chief, task_index)
+
+
+class _ElasticAverageOptimizerHook(session_run_hook.SessionRunHook):
+ def __init__(self, ea_optimizer, is_chief, task_index):
+ """Creates hook to handle ElasticAverageOptimizer initialization ops.
+
+ Args:
+ ea_optimizer: `ElasticAverageOptimizer` which this hook will initialize.
+      is_chief: `Bool`, whether this is a chief replica or not.
+      task_index: `int`, the task index of this worker.
+    """
+ self._ea_optimizer = ea_optimizer
+ self._is_chief = is_chief
+ self._task_index = task_index
+
+ def begin(self):
+ self._local_init_op = variables.local_variables_initializer()
+ self._global_init_op = None
+ if self._is_chief:
+ self._global_init_op = variables.global_variables_initializer()
+ self._variable_init_op = self._ea_optimizer.get_init_op(self._task_index)
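+
+# Minimal usage sketch (hypothetical `cluster`, `worker_device`,
+# `num_workers`, `is_chief`, `task_index`, `target` and `build_model` are
+# assumed; see elastic_average_optimizer_test.py for a runnable variant):
+#
+#   ea_getter = ElasticAverageCustomGetter(worker_device=worker_device)
+#   with tf.device(tf.train.replica_device_setter(
+#       worker_device=worker_device, cluster=cluster)), \
+#       tf.variable_scope('', custom_getter=ea_getter):
+#     loss = build_model()
+#     global_step = tf.train.get_or_create_global_step()
+#   opt = ElasticAverageOptimizer(
+#       opt=tf.train.GradientDescentOptimizer(0.1),
+#       num_worker=num_workers,
+#       ea_custom_getter=ea_getter)
+#   train_op = opt.minimize(loss, global_step)
+#   hook = opt.make_session_run_hook(is_chief, task_index)
+#   with tf.train.MonitoredTrainingSession(
+#       master=target, is_chief=is_chief, hooks=[hook]) as sess:
+#     while not sess.should_stop():
+#       sess.run(train_op)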
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
new file mode 100644
index 0000000..59e55fc
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
@@ -0,0 +1,225 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ElasticAverageOptimizer."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import portpicker
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import server_lib
+from tensorflow.python.training import training
+from tensorflow.python.training import training_util
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import device_setter
+
+from tensorflow.contrib.opt.python.training.elastic_average_optimizer import \
+ ElasticAverageOptimizer, ElasticAverageCustomGetter, GLOBAL_VARIABLE_NAME
+
+
+def create_local_cluster(num_workers, num_ps, protocol="grpc"):
+ """Create local GRPC servers and return them."""
+ worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
+ ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
+ cluster_dict = {
+ "worker": ["localhost:%s" % port for port in worker_ports],
+ "ps": ["localhost:%s" % port for port in ps_ports]
+ }
+ cs = server_lib.ClusterSpec(cluster_dict)
+
+ workers = [
+ server_lib.Server(
+ cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
+ for ix in range(num_workers)
+ ]
+ ps_servers = [
+ server_lib.Server(
+ cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
+ for ix in range(num_ps)
+ ]
+
+ return cluster_dict, workers, ps_servers
+
+
+# Creates the workers and returns their sessions, graphs and train_ops.
+# The chief worker updates last.
+def _get_workers(num_workers, period, workers, moving_rate):
+ sessions = []
+ graphs = []
+ train_ops = []
+ for worker_id in range(num_workers):
+ graph = ops.Graph()
+ is_chief = (worker_id == 0)
+ with graph.as_default():
+ worker_device = "/job:worker/task:%d/cpu:0" % (worker_id)
+ ea_coustom = ElasticAverageCustomGetter(
+ worker_device=worker_device)
+ with variable_scope.variable_scope('',
+ custom_getter=ea_coustom), ops.device(
+ device_setter.replica_device_setter(worker_device=worker_device,
+ ps_device="/job:ps/task:0/cpu:0",
+ ps_tasks=1)):
+ global_step = variables.Variable(0, name='global_step',
+ trainable=False)
+ var_0 = variable_scope.get_variable(initializer=0.0, name="v0")
+ var_1 = variable_scope.get_variable(initializer=1.0, name="v1")
+
+ with ops.device("/job:worker/task:" + str(worker_id)):
+ grads_0 = constant_op.constant(-1.0)
+ grads_1 = constant_op.constant(-1.0)
+
+ sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
+ opt = ElasticAverageOptimizer(
+ opt=sgd_opt,
+ num_worker=num_workers,
+ moving_rate=moving_rate,
+ communication_period=period,
+ ea_custom_getter=ea_coustom
+ )
+ train_op = [
+ opt.apply_gradients(
+ ([grads_0, var_0],
+ [grads_1, var_1]), global_step)
+ ]
+ easgd_hook = opt.make_session_run_hook(is_chief, worker_id)
+ # Creates MonitoredSession
+ sess = training.MonitoredTrainingSession(workers[worker_id].target,
+ hooks=[easgd_hook])
+
+ sessions.append(sess)
+ graphs.append(graph)
+ train_ops.append(train_op)
+
+ return sessions, graphs, train_ops
+
+
+class ElasticAverageOptimizerTest(test.TestCase):
+ def _run(self, train_op, sess):
+ sess.run(train_op)
+
+ def test1Workers2Period(self):
+ num_workers = 1
+ communication_period = 2
+ num_ps = 1
+ cluster, workers, _ = create_local_cluster(num_workers=num_workers,
+ num_ps=num_ps)
+
+ sessions, graphs, train_ops = _get_workers(num_workers,
+ communication_period,
+ workers, 1.0)
+
+ var_0 = graphs[0].get_tensor_by_name('v0:0')
+ var_1 = graphs[0].get_tensor_by_name('v1:0')
+ global_step = training_util.get_global_step(graphs[0])
+ var_0_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v0:0")
+ var_1_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v1:0")
+ # Verify the initialized value.
+ self.assertAllEqual(0.0, sessions[0].run(var_0))
+ self.assertAllEqual(1.0, sessions[0].run(var_1))
+ self.assertAllEqual(0.0, sessions[0].run(var_0_g))
+ self.assertAllEqual(1.0, sessions[0].run(var_1_g))
+ self.assertAllEqual(0, sessions[0].run(global_step))
+
+ sessions[0].run(train_ops[0])
+
+ self.assertAllEqual(1.0, sessions[0].run(var_0))
+ self.assertAllEqual(2.0, sessions[0].run(var_1))
+ self.assertAllEqual(0.0, sessions[0].run(var_0_g))
+ self.assertAllEqual(1.0, sessions[0].run(var_1_g))
+ self.assertAllEqual(0, sessions[0].run(global_step))
+
+    # Iteration 2: triggers the global variable update.
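+    # With learning rate 1.0 and gradient -1.0, the second local step moves
+    # var_0 to 2.0 and var_1 to 3.0. Since local_step % period == 0, the
+    # elastic update (moving_rate=1.0) then pulls each local variable back by
+    # its full difference from the center (0.0 and 1.0 respectively) and
+    # moves each center forward by the same amount, giving the values below.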
+ sessions[0].run(train_ops[0])
+
+ self.assertAllEqual(0.0, sessions[0].run(var_0))
+ self.assertAllEqual(1.0, sessions[0].run(var_1))
+ self.assertAllEqual(2.0, sessions[0].run(var_0_g))
+ self.assertAllEqual(3.0, sessions[0].run(var_1_g))
+ self.assertAllEqual(1, sessions[0].run(global_step))
+
+    # Iteration 3: no global update (local_step is odd).
+ sessions[0].run(train_ops[0])
+
+ self.assertAllEqual(1.0, sessions[0].run(var_0))
+ self.assertAllEqual(2.0, sessions[0].run(var_1))
+ self.assertAllEqual(2.0, sessions[0].run(var_0_g))
+ self.assertAllEqual(3.0, sessions[0].run(var_1_g))
+ self.assertAllEqual(1, sessions[0].run(global_step))
+
+ def test2Worker1Period(self):
+ num_workers = 2
+ communication_period = 1
+ num_ps = 2
+ cluster, workers, _ = create_local_cluster(num_workers=num_workers,
+ num_ps=num_ps)
+
+ sessions, graphs, train_ops = _get_workers(num_workers,
+ communication_period,
+ workers, 0.5)
+
+ var_0 = graphs[0].get_tensor_by_name('v0:0')
+ var_1 = graphs[0].get_tensor_by_name('v1:0')
+
+ var_0_1 = graphs[1].get_tensor_by_name('v0:0')
+ var_1_1 = graphs[1].get_tensor_by_name('v1:0')
+
+ var_0_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v0:0")
+ var_1_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v1:0")
+ # Verify the initialized value.
+ self.assertAllEqual(0.0, sessions[0].run(var_0))
+ self.assertAllEqual(1.0, sessions[0].run(var_1))
+ self.assertAllEqual(0.0, sessions[1].run(var_0_1))
+ self.assertAllEqual(1.0, sessions[1].run(var_1_1))
+ self.assertAllEqual(0.0, sessions[0].run(var_0_g))
+ self.assertAllEqual(1.0, sessions[0].run(var_1_g))
+
+ sessions[0].run(train_ops[0])
+ sessions[1].run(train_ops[1])
+
+ self.assertAllEqual(0.5, sessions[0].run(var_0))
+ self.assertAllEqual(1.5, sessions[0].run(var_1))
+ self.assertAllEqual(0.75, sessions[0].run(var_0_g))
+ self.assertAllEqual(1.75, sessions[0].run(var_1_g))
+ self.assertAllEqual(0.75, sessions[1].run(var_0_1))
+ self.assertAllEqual(1.75, sessions[1].run(var_1_1))
+
+ def testPS2TasksWithClusterSpecClass(self):
+ cluster_spec = server_lib.ClusterSpec({
+ "ps": ["ps0:2222", "ps1:2222"],
+ "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+ })
+ ea_coustom = ElasticAverageCustomGetter(
+ worker_device="/job:worker/task:0")
+ with ops.device(
+ device_setter.replica_device_setter(cluster=cluster_spec,
+ worker_device="/job:worker/task:0",
+ ps_device="/job:ps")), \
+ variable_scope.variable_scope('', custom_getter=ea_coustom):
+ v = variable_scope.get_variable(initializer=[1, 2], name="v")
+ w = variable_scope.get_variable(initializer=[2, 1], name='w')
+      v_g, w_g = ea_coustom._global_map[v], ea_coustom._global_map[w]
+ self.assertDeviceEqual("/job:worker/task:0", v.device)
+ self.assertDeviceEqual("job:ps/task:0", v_g.device)
+ self.assertDeviceEqual("/job:worker/task:0", w.device)
+ self.assertDeviceEqual("job:ps/task:1", w_g.device)
+
+
+if __name__ == '__main__':
+ test.main()
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index 84fcf73..63155fa 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -39,6 +39,9 @@
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables as variables_lib
from tensorflow.python.platform import test
+from tensorflow.python.framework import test_util
+from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
+
# pylint: enable=protected-access
Linear = core_rnn_cell._Linear # pylint: disable=invalid-name
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 5a6d287..c6b1316 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -36,6 +36,7 @@
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops import partitioned_variables
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import nest
diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index b55d90c..dec03ce 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -223,8 +223,7 @@
def sample(self, time, outputs, name=None, **unused_kwargs):
with ops.name_scope(name, "TrainingHelperSample", [time, outputs]):
- sample_ids = math_ops.cast(
- math_ops.argmax(outputs, axis=-1), dtypes.int32)
+ sample_ids = math_ops.argmax(outputs, axis=-1, output_type=dtypes.int32)
return sample_ids
def next_inputs(self, time, outputs, state, name=None, **unused_kwargs):
@@ -540,8 +539,7 @@
if not isinstance(outputs, ops.Tensor):
raise TypeError("Expected outputs to be a single Tensor, got: %s" %
type(outputs))
- sample_ids = math_ops.cast(
- math_ops.argmax(outputs, axis=-1), dtypes.int32)
+ sample_ids = math_ops.argmax(outputs, axis=-1, output_type=dtypes.int32)
return sample_ids
def next_inputs(self, time, outputs, state, sample_ids, name=None):
diff --git a/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py b/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
index a781c64..c42c7b3 100644
--- a/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
+++ b/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
@@ -62,7 +62,9 @@
seed=None,
scope=None):
"""Creates a DatasetDataProvider.
-
+ Note: if `num_epochs` is not `None`, a local counter `epochs` will be
+ created by the relevant function. Use `local_variables_initializer()` to
+ initialize local variables.
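+ For example (a sketch; `my_dataset` is assumed to be a `Dataset` instance):
+   provider = DatasetDataProvider(my_dataset, num_epochs=1)
+   [image] = provider.get(['image'])
+   # Run tf.local_variables_initializer() before reading from the provider.
+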
Args:
dataset: An instance of the Dataset class.
num_readers: The number of parallel readers to use.
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head.py b/tensorflow/contrib/timeseries/python/timeseries/head.py
index 5896fc2..f0330bf 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head.py
@@ -19,7 +19,7 @@
import re
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
from tensorflow.contrib.layers.python.layers import optimizers
from tensorflow.contrib.timeseries.python.timeseries import feature_keys
@@ -79,7 +79,7 @@
train_op = optimizers.optimize_loss(
model_outputs.loss,
- global_step=variables.get_global_step(),
+ global_step=training_util.get_global_step(),
optimizer=self.optimizer,
# Learning rate is set in the Optimizer object
learning_rate=None)
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index c51b172..ee9988f 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -348,7 +348,7 @@
kernel);
}
- // Try to instantiate this function for the func/attr. Maybe its
+ // Try to instantiate this function for the func/attr. Maybe it's
// cached already.
Handle handle;
TF_RETURN_IF_ERROR(Instantiate(ndef.op(), AttrSlice(&ndef.attr()), &handle));
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index edd952b..8514d7c 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -46,7 +46,7 @@
EIGEN_DEVICE_FUNC bfloat16() {}
EIGEN_DEVICE_FUNC explicit bfloat16(const float v) {
- if (isnan(v)) {
+ if (Eigen::numext::isnan(v)) {
value = NAN_VALUE;
return;
}
diff --git a/tensorflow/core/framework/tensor_shape_test.cc b/tensorflow/core/framework/tensor_shape_test.cc
index 06c576c..d8a9c0b 100644
--- a/tensorflow/core/framework/tensor_shape_test.cc
+++ b/tensorflow/core/framework/tensor_shape_test.cc
@@ -359,7 +359,8 @@
for (const auto& d : proto.dim()) {
if (d.size() < 0) {
return errors::InvalidArgument("Shape ", DebugString(proto),
- " has negative dimensions");
+ " has negative dimensions; ",
+ "perhaps an un-fed placeholder?");
}
num_elements *= d.size();
if (num_elements > kMaxElements) {
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index 880e4e7..3df9814 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -76,12 +76,12 @@
namespace mkl_op_registry {
static const char* kMklOpLabel = "MklOp";
static const char* kMklOpLabelPattern = "label='MklOp'";
+ // Prefix that we add to Tensorflow op name to construct Mkl op name.
+ static const char* const kMklOpPrefix = "_Mkl";
// Get the name of Mkl op from original TensorFlow op
// We prefix 'Mkl' to the original op to get Mkl op.
inline string GetMklOpName(const string& name) {
- // Prefix that we add to Tensorflow op name to construct Mkl op name.
- const char* const kMklOpPrefix = "_Mkl";
return string(kMklOpPrefix) + name;
}
@@ -94,9 +94,6 @@
string kernel = KernelsRegisteredForOp(op_name);
bool result =
kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT);
- if (result) {
- VLOG(1) << "mkl_op_registry::" << op_name << " is " << kMklOpLabel;
- }
return result;
}
@@ -112,15 +109,12 @@
if (!IsMklOp(op_name, T)) {
return false;
}
-
bool result = (0 == op_name.compare(GetMklOpName("Add")) ||
0 == op_name.compare(GetMklOpName("Sub")) ||
0 == op_name.compare(GetMklOpName("Mul")) ||
0 == op_name.compare(GetMklOpName("Maximum")) ||
0 == op_name.compare(GetMklOpName("SquaredDifference")));
- VLOG(1) << "mkl_op_registry::" << op_name
- << " is elementwise MKL op: " << result;
return result;
}
} // namespace mkl_op_registry
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 912075a..3beca1e 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -42,6 +42,8 @@
namespace tensorflow {
+#ifndef INTEL_MKL_DNN
+
// This pass implements rewriting of graph to support following scenarios:
// (A) Merging nodes in the graph
// (B) Rewriting a node in the graph to a new node
@@ -2213,6 +2215,2087 @@
return Status::OK();
}
+#else // INTEL_MKL_DNN
+
+// This pass implements rewriting of graph to support following scenarios:
+// (A) Merging nodes in the graph
+// (B) Rewriting a node in the graph to a new node
+// Rewrite happens under following scenario:
+// - Propagating Mkl layout as an additional output tensor
+// (we will loosely call a tensor that carries Mkl layout as Mkl tensor
+// henceforth.) from every Mkl supported NN layer.
+//
+// Example of A : Merging nodes in the graph
+// -----------------------------------------
+// Currently, we merge Conv2D+BiasAdd together. Consider Conv2D and BiasAdd as:
+//
+// O = Conv2D(A, B)
+// P = BiasAdd(O, C)
+//
+// We merge them into Conv2DWithBias as:
+// P = _MklConv2DWithBias(A, A_m, B, B_m, C, C_m)
+//
+// The meaning of A_m, B_m and C_m is explained in B.1.
+//
+// Merge rules:
+// - The merge for Conv2D and BiasAdd happens when the output of Conv2D _only_
+// goes to BiasAdd.
+// - Also, the intersection of attributes of both the nodes must have same
+// values.
+// - Both the nodes must have been assigned to same device (if any).
+//
+// Example of B.1 : Rewriting nodes to Mkl nodes
+// ---------------------------------------------
+// Consider a Relu node. Current definition of Relu node looks like:
+//
+// O = Relu(A)
+//
+// Relu has 1 input (A), and 1 output (O).
+//
+// This rewrite pass will generate a new graph node for Relu (new node is
+// called MklRelu) as:
+//
+// O, O_m = MklRelu(A, A_m)
+//
+// MklRelu has 2 inputs (A and A_m) and 2 outputs (O and O_m). Here input A is
+// same as input A of Relu; output O is same as output O of Relu. O_m is the
+// additional output tensor that will be set by MklRelu, and it represents
+// Mkl tensor corresponding to O -- in other words, O_m is some kind of
+// metadata for O. A_m is additional input of Relu, and it represents metadata
+// for A - as O_m is metadata for O, A_m is metadata for A. MklRelu receives
+// this metadata from previous node in the graph.
+//
+// When a previous node in the graph is an Mkl node, A_m will represent a valid
+// Mkl tensor. But when a previous node is not an Mkl node, A_m will represent
+// a dummy Mkl tensor.
+//
+// Rewriting rules:
+// - Selection of a node for rewriting happens by registering the op type of
+// the node with the rewriting pass. If the op type is not registered, then
+// all nodes of this op type will not be rewritten.
+// - Number of inputs after rewriting:
+// Since for every input Tensorflow tensor, the rewritten node gets Mkl
+//      tensor(s), the rewritten node gets 2*N inputs, where N is the number
+//      of inputs for the original node.
+// - Number of outputs after rewriting:
+// Since for every output Tensorflow tensor, the rewritten node generates
+// Mkl tensor(s), the rewritten node generates 2*N outputs, where N is the
+// number of outputs of the original node.
+// - Ordering of Tensorflow tensors and Mkl tensors:
+// Since every rewritten node generates twice the number of inputs and
+// outputs, one could imagine various orderings among Tensorflow tensors
+// and Mkl tensors. E.g., assume an op 'Conv2D' that takes (A, B) as
+// inputs, then the new op '_MklConv2D' can take inputs A, B, A_m and B_m
+// in A, A_m, B, B_m order or it can also take them in A, B, A_m, B_m
+// order. Among N inputs one can get N! permutations.
+//
+// So the question is: which order do we follow? We support 2 types of
+// orderings: (1) interleaved, and (2) contiguous. Interleaved ordering
+// follows an intuitive order where an Mkl tensor follows the
+// corresponding Tensorflow tensor immediately. In the context of the
+// above example, it will be: A, A_m, B, B_m. Note that the ordering rule
+// applies to both the inputs and outputs. Contiguous ordering means
+// all the Tensorflow tensors are contiguous followed by all the Mkl
+// tensors. We use contiguous ordering as default.
+//
+// Graph rewrite algorithm:
+// Algorithm: Graph Rewrite
+// Input: Graph G, Names of the nodes to rewrite and their new names
+// Output: Modified Graph G' if the nodes are modified, G otherwise.
+// Start:
+// N = Topological_Sort(G) // N is a set of nodes in toposort order.
+// foreach node n in N
+// do
+// if (Is_MKL_Op(n)) // Can this node accept an Mkl layout as input.
+// then
+// E = set of <incoming edge and its src_output slot> of n
+// E' = {} // a new set of edges for rewritten node
+// foreach <e,s> in E
+// do
+// E' U {<e,s>} // First copy edge which generates Tensorflow
+// // tensor as it is
+// m = Source node of edge e
+// if Is_Rewritten(m) // Did we rewrite this node in this pass?
+// then
+// E' U {<m,s+1>} // If yes, then m will generate an Mkl
+// // tensor as an additional output.
+// else
+// d = Generate_Dummy_Mkl_Tensor() // If not, generate a dummy
+// // Mkl tensor.
+// E' U {<d,0>} // The dummy Mkl tensor has only 1 output slot.
+// fi
+// done
+// n' = Build_New_Node(G,new_name,E')
+// Mark_Rewritten(n') // Mark the new node as being rewritten.
+// fi
+// done
+//
+// Explanation:
+// For graph rewrite, we visit nodes of the input graph in the
+// topological sort order. With this ordering, we visit nodes in the
+// top-to-bottom fashion. We need this order because while visiting a
+// node we want that all of its input nodes are visited and rewritten if
+// applicable. This is because if we need to rewrite a given node
+// then all of its input nodes need to be fixed (in other words they
+// cannot be deleted later.)
+//
+// While visiting a node, we first check if the op type of the node is
+// an Mkl op. If it is, then we rewrite that node after constructing
+// new inputs to the node. If the op type of the node is not Mkl op,
+// then we do not rewrite that node.
+//
+// Handling workspace propagation for certain ops:
+//
+// Certain backward ops in MKL (MaxPool, LRN and BatchNorm) require
+// passing of a workspace from their respective forward ops. Workspace
+// tensors provide memory for storing results of intermediate operations
+// which are helpful in backward propagation. TensorFlow does not have
+// a notion of a workspace and as a result does not allow producing
+// additional outputs from these forward ops. For these ops, we need
+// to add 2 extra edges between forward ops and their corresponding
+// backward ops - the first extra edge carries a workspace tensor and
+// the second one carries an Mkl tensor for the workspace tensor.
+//
+// Example:
+//
+// Typical graph for MaxPool and its gradient looks like:
+//
+// A = MaxPool(T)
+// B = MaxPoolGrad(X, A, Y)
+//
+// We will transform this graph to propagate the workspace as:
+// (with the contiguous ordering)
+//
+// A, W, A_m, W_m = MklMaxPool(T, T_m)
+// B, B_m = MklMaxPoolGrad(X, A, Y, W, X_m, A_m, Y_m, W_m)
+//
+// Here W is the workspace tensor. Transformed tensor names with the
+// suffix _m are Mkl tensors, and this transformation has been done
+// using the algorithm discussed earlier. The transformation for
+// workspace propagation only adds extra outputs (W, W_m) for a forward
+// op and connects them to the corresponding backward ops.
+//
+// Terms:
+//
+// Forward op name = name of the op in the forward pass
+// where a workspace tensor originates (MaxPool in this example)
+// Backward op name = name of the op in the backward pass that receives
+// a workspace tensor from the forward op (MaxPoolGrad in the example)
+// Slot = Position of the output or input slot that will be
+// used by the workspace tensor (1 for MklMaxPool as W is the 2nd
+// output of MaxPool (0 is 1st); 3 for MklMaxPoolGrad)
+//
+// Question:
+//
+// How do we associate a backward op to a forward op? There can be more
+// than one op with the exact same name.
+//
+// In this example, we associate MaxPoolGrad with MaxPool. But there
+// could be more than one MaxPool ops. To solve this problem, we look
+// for _direct_ edge between a forward op and a backward op (tensor A is
+// flowing along this edge in the example).
+//
+// How do we transform forward and backward ops when there is no direct
+// edge between them? In such a case, we generate dummy tensors for
+// workspace tensors. For the example, transformation of MaxPool will
+// be exactly the same as it would be when there is a direct edge between
+// the forward and the backward op --- it is just that MaxPool won't
+// generate any workspace tensor. For MaxPoolGrad, the transformation
+// will also be same, but instead of connecting W and W_m with the
+// outputs of MaxPool, we will produce dummy tensors for them, and we
+// will set workspace_enabled attribute to false.
+//
+class MklLayoutRewritePass : public GraphOptimizationPass {
+ public:
+ MklLayoutRewritePass() {
+ // NOTE: names are alphabetically sorted.
+ csinfo_.addn = "AddN";
+ csinfo_.avg_pool = "AvgPool";
+ csinfo_.avg_pool_grad = "AvgPoolGrad";
+ csinfo_.bias_add = "BiasAdd";
+ csinfo_.bias_add_grad = "BiasAddGrad";
+ csinfo_.concat = "Concat";
+ csinfo_.concatv2 = "ConcatV2";
+ csinfo_.conv2d = "Conv2D";
+ csinfo_.conv2d_with_bias = "__MklDummyConv2DWithBias";
+ csinfo_.conv2d_grad_input = "Conv2DBackpropInput";
+ csinfo_.conv2d_grad_filter = "Conv2DBackpropFilter";
+ csinfo_.conv2d_grad_filter_with_bias =
+ "__MklDummyConv2DBackpropFilterWithBias";
+ csinfo_.fused_batch_norm = "FusedBatchNorm";
+ csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad";
+ csinfo_.identity = "Identity";
+ csinfo_.lrn = "LRN";
+ csinfo_.lrn_grad = "LRNGrad";
+ csinfo_.matmul = "MatMul";
+ csinfo_.max_pool = "MaxPool";
+ csinfo_.max_pool_grad = "MaxPoolGrad";
+ csinfo_.mkl_conv2d = "_MklConv2D";
+ csinfo_.mkl_conv2d_grad_input = "_MklConv2DBackpropInput";
+ csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter";
+ csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
+ csinfo_.mkl_conv2d_grad_filter_with_bias =
+ "_MklConv2DBackpropFilterWithBias";
+ csinfo_.relu = "Relu";
+ csinfo_.relu_grad = "ReluGrad";
+ csinfo_.tanh = "Tanh";
+ csinfo_.tanh_grad = "TanhGrad";
+ csinfo_.reshape = "Reshape";
+ csinfo_.softmax = "Softmax";
+ csinfo_.split = "Split";
+ // Element-wise ops. Ensure you also add any new ops to IsOpElementWise
+ // in the MklUtil.h (IsMklElementWiseOp method) to ensure that the
+ // MklInputConversion op is added before it.
+ csinfo_.add = "Add";
+ csinfo_.maximum = "Maximum";
+ csinfo_.mul = "Mul";
+ csinfo_.squared_difference = "SquaredDifference";
+ csinfo_.sub = "Sub";
+ // End - element-wise ops. See note above.
+
+ // NOTE: names are alphabetically sorted.
+ rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn),
+ CopyAttrsAddN, AddNRewrite});
+ rinfo_.push_back({csinfo_.add,
+ mkl_op_registry::GetMklOpName(csinfo_.add),
+ CopyAttrsDataType, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.avg_pool,
+ mkl_op_registry::GetMklOpName(csinfo_.avg_pool),
+ CopyAttrsPooling, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.avg_pool_grad,
+ mkl_op_registry::GetMklOpName(csinfo_.avg_pool_grad),
+ CopyAttrsPooling, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.concat,
+ mkl_op_registry::GetMklOpName(csinfo_.concat),
+ CopyAttrsConcat, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.concatv2,
+ mkl_op_registry::GetMklOpName(csinfo_.concatv2),
+ CopyAttrsConcatV2, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.conv2d,
+ mkl_op_registry::GetMklOpName(csinfo_.conv2d),
+ CopyAttrsConv2D, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.conv2d_with_bias,
+ csinfo_.mkl_conv2d_with_bias,
+ CopyAttrsConv2D, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.conv2d_grad_filter,
+ mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_filter),
+ CopyAttrsConv2D, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.conv2d_grad_filter_with_bias,
+ csinfo_.mkl_conv2d_grad_filter_with_bias,
+ CopyAttrsConv2D, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.conv2d_grad_input,
+ mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_input),
+ CopyAttrsConv2D, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.fused_batch_norm,
+ mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
+ CopyAttrsFusedBatchNorm, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.fused_batch_norm_grad,
+ mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
+ CopyAttrsFusedBatchNorm, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.identity,
+ mkl_op_registry::GetMklOpName(csinfo_.identity),
+ CopyAttrsDataType, AlwaysRewrite});
+ /*
+ rinfo_.push_back({csinfo_.lrn,
+ mkl_op_registry::GetMklOpName(csinfo_.lrn),
+ CopyAttrsLRN, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.lrn_grad,
+ mkl_op_registry::GetMklOpName(csinfo_.lrn_grad),
+ CopyAttrsLRN, AlwaysRewrite});
+ */
+ rinfo_.push_back({csinfo_.max_pool,
+ mkl_op_registry::GetMklOpName(csinfo_.max_pool),
+ CopyAttrsPooling, NonDepthBatchWisePoolRewrite});
+ rinfo_.push_back({csinfo_.max_pool_grad,
+ mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad),
+ CopyAttrsPooling, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.maximum,
+ mkl_op_registry::GetMklOpName(csinfo_.maximum),
+ CopyAttrsDataType, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.mul,
+ mkl_op_registry::GetMklOpName(csinfo_.mul),
+ CopyAttrsDataType, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.relu,
+ mkl_op_registry::GetMklOpName(csinfo_.relu),
+ CopyAttrsDataType, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.relu_grad,
+ mkl_op_registry::GetMklOpName(csinfo_.relu_grad),
+ CopyAttrsDataType, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.tanh,
+ mkl_op_registry::GetMklOpName(csinfo_.tanh),
+ CopyAttrsDataType, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.tanh_grad,
+ mkl_op_registry::GetMklOpName(csinfo_.tanh_grad),
+ CopyAttrsDataType, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.reshape,
+ mkl_op_registry::GetMklOpName(csinfo_.reshape),
+ CopyAttrsReshape, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.softmax,
+ mkl_op_registry::GetMklOpName(csinfo_.softmax),
+ CopyAttrsDataType, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.squared_difference,
+ mkl_op_registry::GetMklOpName(csinfo_.squared_difference),
+ CopyAttrsDataType, AlwaysRewrite});
+ rinfo_.push_back({csinfo_.sub,
+ mkl_op_registry::GetMklOpName(csinfo_.sub),
+ CopyAttrsDataType, AlwaysRewrite});
+
+ // Add info about which ops to add workspace edge to and the slots.
+ wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3});
+ wsinfo_.push_back({csinfo_.max_pool, csinfo_.max_pool_grad, 0, 1, 1, 3});
+
+ // Add a rule for merging nodes
+ minfo_.push_back({csinfo_.conv2d, csinfo_.bias_add,
+ csinfo_.conv2d_with_bias,
+ GetConv2DOrBiasAdd});
+
+ minfo_.push_back({csinfo_.conv2d_grad_filter, csinfo_.bias_add_grad,
+ csinfo_.conv2d_grad_filter_with_bias,
+ GetConv2DBackpropFilterOrBiasAddGrad});
+ }
+
+ // Standard interface to run pass
+ Status Run(const GraphOptimizationPassOptions& options);
+
+ // Helper function which does most of heavy lifting for rewriting
+ // Mkl nodes to propagate Mkl tensor as additional output
+ //
+ // Extracts common functionality between Run public interface and
+ // test interface.
+ //
+ // @return true, if and only if graph is mutated; false otherwise.
+ bool RunPass(std::unique_ptr<Graph>* g);
+
+ /// Structure to specify the name of an original node, its new name after
+ /// rewrite, the number of inputs to the original node, the function to
+ /// be used to copy attributes for the op, and the rule (if any) which
+ /// must hold for rewriting the node
+ typedef struct {
+ string name; // Original name of op of the node in the graph
+ string new_name; // New name of the op of the node in the graph
+ // A function handler to copy attributes from an old node to a new node.
+ std::function<void(const Node*, NodeBuilder*)> copy_attrs;
+ // A rule under which to rewrite this node
+ std::function<bool(const Node*)> rewrite_rule;
+ } RewriteInfo;
+
+ /// Structure to specify a forward op, a backward op, and the slot numbers
+ /// in the forward and backward ops where we will add a workspace edge.
+ typedef struct {
+ string fwd_op; // Name of a forward op in the graph
+ string bwd_op; // Name of a backward op in the graph
+ int fwd_slot; // Output slot in the forward op node where actual
+ // output tensor resides
+ int bwd_slot; // Input slot in the backward op node where actual
+ // input tensor resides
+ int ws_fwd_slot; // Output slot in the forward op node where workspace
+ // edge is added
+ int ws_bwd_slot; // Input slot in the backward op node where workspace
+ // edge is added
+ } WorkSpaceInfo;
+
+ /// Structure to specify information used in node merge of 2 operators
+ typedef struct {
+ string op1; // Node string for one operator.
+ string op2; // Node string for second operator.
+ string new_node; // Name of the node after merge
+ // Function that enables user of the node merger to specify how to find
+ // second operator given the first operator.
+ std::function<Node*(const Node*)> get_node_to_be_merged;
+ } MergeInfo;
+
+ /// Structure to store all constant strings
+ /// NOTE: names are alphabetically sorted.
+ typedef struct {
+ string addn;
+ string add;
+ string avg_pool;
+ string avg_pool_grad;
+ string bias_add;
+ string bias_add_grad;
+ string concat;
+ string concatv2;
+ string conv2d;
+ string conv2d_with_bias;
+ string conv2d_grad_input;
+ string conv2d_grad_filter;
+ string conv2d_grad_filter_with_bias;
+ string fused_batch_norm;
+ string fused_batch_norm_grad;
+ string identity;
+ string lrn;
+ string lrn_grad;
+ string matmul;
+ string max_pool;
+ string max_pool_grad;
+ string maximum;
+ string mkl_conv2d;
+ string mkl_conv2d_grad_input;
+ string mkl_conv2d_grad_filter;
+ string mkl_conv2d_grad_filter_with_bias;
+ string mkl_conv2d_with_bias;
+ string mul;
+ string relu;
+ string relu_grad;
+ string tanh;
+ string tanh_grad;
+ string reshape;
+ string softmax;
+ string split;
+ string squared_difference;
+ string sub;
+ } ConstStringsInfo;
+
+ private:
+ /// Maintain info about nodes to rewrite
+ std::vector<RewriteInfo> rinfo_;
+
+ /// Maintain info about nodes to add workspace edge
+ std::vector<WorkSpaceInfo> wsinfo_;
+
+ /// Maintain info about nodes to be merged
+ std::vector<MergeInfo> minfo_;
+
+ /// Maintain structure of constant strings
+ static ConstStringsInfo csinfo_;
+
+ private:
+ // Is OpDef::ArgDef a list type? It could be N * T or list(type).
+ // Refer to opdef.proto for details of list type.
+ inline bool ArgIsList(const OpDef::ArgDef& arg) const {
+ return !arg.type_list_attr().empty() || !arg.number_attr().empty();
+ }
+
+ // Get length of a list in 'n' if 'arg' is of list type. Refer to
+ // description of ArgIsList for definition of list type.
+ inline int GetTensorListLength(const OpDef::ArgDef& arg, Node* n) {
+ CHECK_EQ(ArgIsList(arg), true);
+ int N = 0;
+ const string attr_name = !arg.type_list_attr().empty()
+ ? arg.type_list_attr()
+ : arg.number_attr();
+ if (!arg.type_list_attr().empty()) {
+ std::vector<DataType> value;
+ TF_CHECK_OK(GetNodeAttr(n->def(), attr_name, &value));
+ N = value.size();
+ } else {
+ TF_CHECK_OK(GetNodeAttr(n->def(), attr_name, &N));
+ }
+ return N;
+ }
+
+ // Can op represented by node 'n' run on DEVICE_CPU?
+ // Op can run on CPU with MKL if the runtime assigned device or the
+ // user requested device contains device CPU, or both are empty.
+ bool CanOpRunOnCPUDevice(const Node* n) {
+ bool result = true;
+ string reason;
+
+ // Substring that should be checked for in device name for CPU device.
+ const char* const kCPUDeviceSubStr = "CPU";
+
+ // If Op has been specifically assigned to a non-CPU device, then No.
+ if (!n->assigned_device_name().empty() &&
+ !StringPiece(n->assigned_device_name()).contains(kCPUDeviceSubStr)) {
+ result = false;
+ reason = "Op has been assigned a runtime device that is not CPU.";
+ }
+
+ // If user has specifically assigned this op to a non-CPU device, then No.
+ if (!n->def().device().empty() &&
+ !StringPiece(n->def().device()).contains(kCPUDeviceSubStr)) {
+ result = false;
+ reason = "User has assigned a device that is not CPU.";
+ }
+
+ if (result == false) {
+ VLOG(1) << "MklLayoutRewritePass: Skipping rewriting of the node "
+ << n->type_string() << ", reason: " << reason;
+ }
+
+ // Otherwise Yes.
+ return result;
+ }
+
+ // Return a node that can be merged with input node 'n'
+ //
+ // @return pointer to the node if we can find such a
+ // node. Otherwise, it returns nullptr.
+ Node* CheckForNodeMerge(const Node* n) const;
+
+ // Merge node 'm' with node 'n'.
+ // Currently, we merge (1) Conv2D with BiasAdd, and (2) BiasAddGrad with
+ // Conv2DBackpropFilter.
+ //
+ // Input nodes m and n may be deleted if the call to
+ // this function is successful. Attempt to use the pointers
+ // after the call to function may result in undefined behaviors.
+ //
+ // @input g - input graph, m - graph node, n - graph node to be merged with m
+ // @return Status::OK(), if merging is successful and supported.
+ // Returns appropriate Status error code otherwise.
+ // Graph is updated in case nodes are merged. Otherwise, it is
+ // not updated.
+ Status MergeNode(std::unique_ptr<Graph>* g, Node* m, Node* n);
+
+ // Helper function to merge different nodes
+ Status MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g, Node* m, Node* n);
+ Status MergeConv2DBackpropFilterWithBiasAddGrad(std::unique_ptr<Graph>* g,
+ Node* m, Node* n);
+
+ // Find BiasAdd or Conv2D node that can be merged with input node 'm'.
+ // If input 'm' is BiasAdd, then check if there exists Conv2D node that can be
+ // merged with 'm'. If input 'm' is Conv2D, then check if there exists BiasAdd
+ // node that can be merged with 'm'.
+ static Node* GetConv2DOrBiasAdd(const Node* m) {
+ CHECK_NOTNULL(m);
+ Node* n = nullptr;
+
+ if (m->type_string() == csinfo_.bias_add) {
+      // If 'm' is BiasAdd, then Conv2D is the 0th input of BiasAdd.
+ TF_CHECK_OK(m->input_node(0, &n));
+ } else {
+ CHECK_EQ(m->type_string(), csinfo_.conv2d);
+ // Go over all output edges and search for BiasAdd Node.
+ // 0th input of BiasAdd is Conv2D.
+ for (const Edge* e : m->out_edges()) {
+ if (!e->IsControlEdge() &&
+ e->dst()->type_string() == csinfo_.bias_add &&
+ e->dst_input() == 0) {
+ n = e->dst();
+ break;
+ }
+ }
+ }
+
+ if (n == nullptr) {
+ VLOG(1) << "MklLayoutRewritePass: Could not find matching "
+ << "Conv2D and BiasAdd node for merging. Input node: "
+ << m->DebugString();
+ }
+
+ return n;
+ }
+
+ // Find Conv2DBackpropFilter or BiasAddGrad node that can be merged with input
+ // node 'm'. If input 'm' is Conv2DBackpropFilter, then check if there exists
+ // BiasAddGrad node that can be merged with 'm'. If input 'm' is BiasAddGrad,
+ // then check if there exists Conv2DBackpropFilter node that can be merged
+ // with 'm'.
+ //
+ // Graph that will allow us to connect Conv2DBackpropFilter with BiasAddGrad
+ // would look like:
+ //
+ // _ = Conv2DBackpropFilter(F, _, G)
+ // _ = BiasAddGrad(G)
+ //
+ // So 1st input of BiasAddGrad connects with 3rd input of
+ // Conv2DBackpropFilter and vice versa.
+ static Node* GetConv2DBackpropFilterOrBiasAddGrad(const Node* m) {
+ CHECK_NOTNULL(m);
+ Node* n = nullptr;
+
+ if (m->type_string() == csinfo_.bias_add_grad) {
+ // Get 1st input 'g' of BiasAddGrad.
+ Node* g = nullptr;
+ TF_CHECK_OK(m->input_node(0, &g));
+ // Now traverse all outgoing edges from g that have destination node as
+ // Conv2DBackpropFilter.
+ for (const Edge* e : g->out_edges()) {
+ if (!e->IsControlEdge() &&
+ e->dst()->type_string() == csinfo_.conv2d_grad_filter &&
+ e->dst_input() == 2 /* 3rd input of BackpropFilter */) {
+ n = e->dst();
+ break;
+ }
+ }
+ } else {
+ CHECK_EQ(m->type_string(), csinfo_.conv2d_grad_filter);
+ // Get 3rd input 'g' of Conv2DBackpropFilter.
+ Node* g = nullptr;
+ TF_CHECK_OK(m->input_node(2, &g));
+ // Now traverse all outgoing edges from g that have destination node as
+ // BiasAddGrad.
+ for (const Edge* e : g->out_edges()) {
+ if (!e->IsControlEdge() &&
+ e->dst()->type_string() == csinfo_.bias_add_grad &&
+ e->dst_input() == 0 /* 1st input of BiasAddGrad */) {
+ n = e->dst();
+ break;
+ }
+ }
+ }
+
+ if (n == nullptr) {
+ VLOG(1) << "MklLayoutRewritePass: Could not find matching "
+ << "Conv2DBackpropFilter and BiasAddGrad node for merging. "
+ << "Input node: " << m->DebugString();
+ }
+ return n;
+ }
+
+ // Check if the node 'n' has any applicable rewrite rule
+ // We check for 2 scenarios for rewrite.
+ //
+ // @return RewriteInfo* for the applicable rewrite rule
+ const RewriteInfo* CheckForNodeRewrite(const Node* n) const;
+
+ // Default rewrite rule to be used in scenario 1 for rewrite.
+ // @return - true (since we want to always rewrite)
+ static bool AlwaysRewrite(const Node* n) {
+ return true;
+ }
+
+ // Check if we are performing pooling on depth or batch. If it is, then we
+ // do not rewrite MaxPool node to Mkl version.
+ // @return - true (if it is not a depth/batch wise pooling case);
+ // false otherwise.
+ static bool NonDepthBatchWisePoolRewrite(const Node* n) {
+ CHECK_NOTNULL(n);
+
+ string data_format_str;
+ TensorFormat data_format;
+ std::vector<int32> ksize, strides;
+ CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true);
+ CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true);
+ CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(),
+ true);
+ CHECK_EQ(FormatFromString(data_format_str, &data_format), true);
+
+ // Condition that specifies non-batch-wise and non-depth-wise pooling.
+ if (GetTensorDim(ksize, data_format, 'N') == 1 &&
+ GetTensorDim(strides, data_format, 'N') == 1 &&
+ GetTensorDim(ksize, data_format, 'C') == 1 &&
+ GetTensorDim(strides, data_format, 'C') == 1) {
+ return true;
+ }
+
+ return false;
+ }
+
+ static bool AddNRewrite(const Node* n) {
+ CHECK_NOTNULL(n);
+
+ int num;
+ CHECK_EQ(GetNodeAttr(n->def(), "N", &num).ok(), true);
+
+    // Rewrite AddN only when it has exactly two inputs.
+ if (num == 2) {
+ return true;
+ }
+
+ return false;
+ }
+
+ // Rewrites input node to a new node specified by its matching rewrite info.
+ //
+ // Method first searches matching rewrite info for input node and then
+ // uses that info to rewrite.
+ //
+ // Input node may be deleted in case of rewrite. Attempt to use the node
+ // after the call can result in undefined behaviors.
+ //
+ // @input g - input graph, n - Node to be rewritten,
+ // ri - matching rewriteinfo
+ // @return Status::OK(), if the input node is rewritten;
+ // Returns appropriate Status error code otherwise.
+ // Graph is updated in case the input node is rewritten.
+ // Otherwise, it is not updated.
+ Status RewriteNode(std::unique_ptr<Graph>* g, Node* n, const RewriteInfo* ri);
+
+ // Get nodes that will feed a list of TF tensors to the new
+ // node that we are constructing.
+ //
+ // @input g - input graph,
+ // @input inputs - inputs to old node that we are using for constructing
+ // new inputs,
+ // @input input_idx - the index in the 'inputs' vector pointing to the
+ // current input that we have processed so far
+ // @output input_idx - index will be incremented by the number of nodes
+ // from 'inputs' that are processed
+ // @input list_length - The expected length of list of TF tensors
+ // @output output_nodes - the list of new nodes creating TF tensors
+ //
+ // @return None
+ void GetNodesProducingTFTensorList(
+ const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+ int* input_idx, int list_length,
+ std::vector<NodeBuilder::NodeOut>* output_nodes);
+
+ // Get nodes that will feed a list of Mkl tensors to the new
+ // node that we are constructing.
+ //
+ // @input g - input graph,
+ // @input orig_node - Original node that we are rewriting
+ // @input inputs - inputs to old node that we are using for constructing
+ // new inputs,
+ // @input input_idx - the index in the 'inputs' vector pointing to the
+ // current input that we have processed so far
+ // @output input_idx - index will be incremented by the number of nodes
+ // from 'inputs' that are processed
+ // @input list_length - The expected length of list of Mkl tensors
+ // @output output_nodes - the list of new nodes creating Mkl tensors
+ //
+ // @return None
+ void GetNodesProducingMklTensorList(std::unique_ptr<Graph>* g,
+ Node* orig_node, const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+ int* input_idx, int list_length,
+ std::vector<NodeBuilder::NodeOut>* output_nodes);
+
+ // Get a node that will feed an Mkl tensor to the new
+ // node that we are constructing. The output node could be (1) 'n'
+ // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
+ // if 'n' is not an Mkl layer.
+ //
+ // @input g - input graph,
+ // @input orig_node - Original node that we are rewriting,
+ // @input n - Node based on which we are creating Mkl node,
+ // @input n_output_slot - the output slot of node 'n'
+ // which is feeding to the node that we are constructing
+ // @output mkl_node - the new node that will feed Mkl tensor
+ // @output mkl_node_output_slot - the slot number of mkl_node that
+ // will feed the tensor
+ // @return None
+ void GetNodeProducingMklTensor(std::unique_ptr<Graph>* g, Node* orig_node,
+ Node* n, int n_output_slot, Node** mkl_node, int* mkl_node_output_slot);
+
+ // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
+ // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are
+ // set up in contiguous fashion. 'workspace_tensors' carry graph nodes
+ // producing workspace edges if 'are_workspace_tensors_available' is true.
+ // Otherwise, 'workspace_tensors' is empty vector.
+ //
+ // For details, refer to 'Ordering of inputs after rewriting' section in the
+ // documentation above.
+ //
+ // Returns Status::OK() if setting up inputs is successful, otherwise
+ // returns appropriate status code.
+ int SetUpContiguousInputs(
+ std::unique_ptr<Graph>* g,
+ const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
+ NodeBuilder* nb, Node* old_node,
+ std::vector<NodeBuilder::NodeOut>* workspace_tensors,
+ bool are_workspace_tensors_available);
+
+ // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
+ // in graph 'g'. Original node is input in 'orig_node'.
+ //
+ // For details, refer to 'Ordering of Tensorflow tensors and Mkl tensors'
+ // section in the documentation above.
+ //
+ // Returns Status::OK() if setting up inputs is successful, otherwise
+ // returns appropriate status code.
+ Status SetUpInputs(std::unique_ptr<Graph>* g,
+ const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+ NodeBuilder* nb, Node* orig_node);
+
+ // Add workspace edge on the input or output side of Node 'orig_node' by using
+ // NodeBuilder 'nb' for the new node provided. If 'orig_node' does not dictate
+ // adding workspace edge then do not add it. Workspace Tensorflow and Mkl
+ // tensors, if they need to be added, will be set into these tensors.
+ // If we set workspace tensors, then are_ws_tensors_added should be true.
+ void AddWorkSpaceEdgeIfNeeded(std::unique_ptr<Graph>* g, Node* orig_node,
+ NodeBuilder* nb,
+ std::vector<NodeBuilder::NodeOut>* ws_tensors,
+ bool* are_ws_tensors_added);
+
+ // Functions specific to operators to copy attributes
+ // We need operator-specific function to copy attributes because the framework
+ // does not provide any generic function for it.
+ // NOTE: names are alphabetically sorted.
+ static void CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb);
+ static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb);
+ static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb);
+ static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb);
+ static void CopyAttrsConv2D(const Node* orig_node, NodeBuilder* nb);
+ static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb);
+ static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb);
+ static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb);
+ static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb);
+ static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb);
+ static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb);
+
+ // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
+ // using node for original node 'orig_node' and return it in '*out'.
+ // TODO(nhasabni) We should move this to mkl_util.h
+ void GetDummyMklTensorNode(std::unique_ptr<Graph>* g, Node** out,
+ Node* orig_node);
+ void GetDummyWorkspaceTensorNode(std::unique_ptr<Graph>* g, Node** out,
+ Node* orig_node);
+};
+
+MklLayoutRewritePass::ConstStringsInfo MklLayoutRewritePass::csinfo_;
+
+// We register Mkl rewrite pass for phase 1 in post partitioning group.
+// We register it here so that we get a complete picture of all users of Mkl
+// nodes. Do not change the ordering of the Mkl passes.
+const OptimizationPassRegistry::Grouping kMklLayoutRewritePassGroup =
+ OptimizationPassRegistry::POST_PARTITIONING;
+REGISTER_OPTIMIZATION(kMklLayoutRewritePassGroup, 1, MklLayoutRewritePass);
+
+//////////////////////////////////////////////////////////////////////////
+// Helper functions for creating new node
+//////////////////////////////////////////////////////////////////////////
+
+static void FillInputs(const Node* n,
+ gtl::InlinedVector<Node*, 4>* control_edges,
+ gtl::InlinedVector<std::pair<Node*, int>, 4>* in) {
+ control_edges->clear();
+ for (const Edge* e : n->in_edges()) {
+ if (e->IsControlEdge()) {
+ control_edges->push_back(e->src());
+ } else {
+ (*in)[e->dst_input()] = std::make_pair(e->src(), e->src_output());
+ }
+ }
+ std::sort(control_edges->begin(), control_edges->end());
+ if (n->op_def().is_commutative()) {
+ // For commutative inputs, we sort the input by the input Node*
+ // to get a canonical ordering (so that add(a,b) and add(b, a) will
+ // hash to the same value if is_commutative is true for 'add').
+ std::sort(in->begin(), in->end());
+ }
+}
+
+void MklLayoutRewritePass::GetNodesProducingTFTensorList(
+ const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, int* input_idx,
+ int list_length, std::vector<NodeBuilder::NodeOut>* output_nodes) {
+ CHECK_LT(*input_idx, inputs.size());
+ CHECK_GT(list_length, 0);
+ CHECK_NOTNULL(output_nodes);
+ output_nodes->reserve(list_length);
+
+ while (list_length != 0) {
+ CHECK_GT(list_length, 0);
+ CHECK_LT(*input_idx, inputs.size());
+ Node* n = inputs[*input_idx].first;
+ int slot = inputs[*input_idx].second;
+    // Input node 'n' produces a single tensor at output slot 'slot', so we
+    // simply add that (node, slot) pair.
+ output_nodes->push_back(NodeBuilder::NodeOut(n, slot));
+ (*input_idx)++;
+ list_length--;
+ }
+}
+
+// TODO(nhasabni) We should move this to mkl_util.h.
+void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
+ Node** out, Node* orig_node) {
+ // We use a tensor of shape {8} and value 0,0,0,0,0,0,0,0 to represent
+ // dummy Mkl tensor. 8 = 2*size_t.
+ const DataType dt = DataTypeToEnum<uint8>::v();
+ TensorProto proto;
+ proto.set_dtype(dt);
+ uint8 zero[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+ proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
+ 8);
+ TensorShape dummy_shape({8});
+ dummy_shape.AsProto(proto.mutable_tensor_shape());
+ TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
+ .Attr("value", proto)
+ .Attr("dtype", dt)
+ .Device(orig_node->def().device()) // We place this node on
+ // the same device as the
+ // device of the original
+ // node.
+ .Finalize(&**g, out));
+
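+  // The node created above is a Const producing the 8-byte all-zero tensor;
+  // (*g)->NewName("DMT") yields names of the form "DMT/_0", "DMT/_1", and so
+  // on, which is how these dummy Mkl tensor nodes appear in the unit tests
+  // for this pass.
+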
+  // If the number of inputs to the original node is > 0, then we add a
+  // control dependency between the 1st input (index 0) of the original node
+  // and the dummy Mkl node. This is needed because control-flow ops such as
+  // Enter, Merge, etc. require the frame_name of the dummy Mkl node to be the
+  // same as that of the rewritten node. Adding a control edge between the 1st
+  // input of the original node and the dummy Mkl node ensures that the dummy
+  // node is in the same frame as the original node. Choosing the 1st input is
+  // not necessary - any input of the original node is fine, because all the
+  // inputs of a node are always in the same frame.
+ if (orig_node->num_inputs() > 0) {
+ Node* orig_input0 = nullptr;
+ TF_CHECK_OK(orig_node->input_node(0,
+ const_cast<const Node**>(&orig_input0)));
+ CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
+ }
+
+ (*out)->set_assigned_device_name(orig_node->assigned_device_name());
+}
+
+void MklLayoutRewritePass::GetNodesProducingMklTensorList(
+ std::unique_ptr<Graph>* g,
+ Node* orig_node,
+ const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+ int* input_idx, int list_length,
+ std::vector<NodeBuilder::NodeOut>* output_nodes) {
+ CHECK_LT(*input_idx, inputs.size());
+ CHECK_GT(list_length, 0);
+ CHECK_NOTNULL(output_nodes);
+ output_nodes->reserve(list_length);
+
+ while (list_length != 0) {
+ CHECK_GT(list_length, 0);
+ CHECK_LT(*input_idx, inputs.size());
+ Node* n = inputs[*input_idx].first;
+ int slot = inputs[*input_idx].second;
+ // If 'n' is producing a single tensor, then create a single Mkl tensor
+ // node.
+ Node* mkl_node = nullptr;
+ int mkl_node_output_slot = 0;
+ GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node,
+ &mkl_node_output_slot);
+ output_nodes->push_back(NodeBuilder::NodeOut(mkl_node,
+ mkl_node_output_slot));
+ (*input_idx)++;
+ list_length--;
+ }
+}
+
+// Get an input node that will feed an Mkl tensor to the new
+// node that we are constructing. The input node could be (1) 'n' itself, if
+// it is an Mkl layer, or (2) a dummy node producing a dummy Mkl tensor, if
+// 'n' is not an Mkl layer.
+void MklLayoutRewritePass::GetNodeProducingMklTensor(std::unique_ptr<Graph>* g,
+ Node* orig_node, Node* n,
+ int n_output_slot, Node** mkl_node, int* mkl_node_output_slot) {
+ CHECK_NOTNULL(n);
+ CHECK_NOTNULL(mkl_node);
+ CHECK_NOTNULL(mkl_node_output_slot);
+
+ // If this is an MKL op, then it will create extra output for MKL layout.
+ DataType T;
+ if (GetNodeAttr(n->def(), "T", &T).ok() &&
+ mkl_op_registry::IsMklOp(n->type_string(), T)) {
+    // 'n' is an Mkl op, so it already produces an Mkl tensor that the new
+    // node can consume directly. With contiguous ordering, the output slot of
+    // the Mkl tensor is N + <slot number of the TensorFlow tensor>, where N
+    // is the total number of TensorFlow tensors produced by 'n'.
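+    // Illustrative example (contiguous ordering, which is what this pass
+    // uses): if 'n' has 2 TensorFlow outputs (4 outputs in total after the
+    // rewrite), the Mkl tensor for TensorFlow output slot 1 is at slot
+    // 2 + 1 = 3.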
+ *mkl_node = n;
+ *mkl_node_output_slot =
+ GetTensorMetaDataIndex(n_output_slot, n->num_outputs());
+ } else {
+ // If we have not visited the node and rewritten it, then we need
+ // to create a dummy node that will feed a dummy Mkl tensor to this node.
+ // DummyMklTensor node has no input and generates only 1 output
+ // (dummy Mkl tensor) as output slot number 0.
+ GetDummyMklTensorNode(g, mkl_node, orig_node);
+ CHECK_NOTNULL(*mkl_node);
+ *mkl_node_output_slot = 0;
+ }
+}
+
+int MklLayoutRewritePass::SetUpContiguousInputs(
+ std::unique_ptr<Graph>* g,
+ const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
+ NodeBuilder* nb, Node* old_node,
+ std::vector<NodeBuilder::NodeOut>* workspace_tensors,
+ bool are_workspace_tensors_available) {
+ CHECK_NOTNULL(workspace_tensors);
+ CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+
+ // TODO(nhasabni): Temporary solution to connect filter input of
+ // BackpropInput with the converted filter from Conv2D.
+ bool do_connect_conv2d_backprop_input_filter = false;
+ Node* conv2d_node = nullptr;
+ // Filter node is 2nd input (slot index 1) of Conv2D.
+  const int kConv2DFilterInputSlotIdx = 1;
+  const int kConv2DBackpropInputFilterInputSlotIdx = 1;
+  const int kConv2DFilterOutputSlotIdx = 1;
+ if (old_node->type_string() == csinfo_.conv2d_grad_input) {
+ // We need to find Conv2D node from Conv2DBackpropInput.
+ // For that let's first find filter node that is 2nd input (slot 1)
+ // of BackpropInput.
+ Node* filter_node = nullptr;
+ old_node->input_node(kConv2DBackpropInputFilterInputSlotIdx, &filter_node);
+ CHECK_NOTNULL(filter_node);
+
+ // Now check which nodes receive from filter_node. Filter feeds as
+ // 2nd input (slot 1) of _MklConv2D and _MklConv2DWithBias.
+ for (const Edge* e : filter_node->out_edges()) {
+ if ((e->dst()->type_string() == csinfo_.mkl_conv2d ||
+ e->dst()->type_string() == csinfo_.mkl_conv2d_with_bias) &&
+ e->dst_input() == kConv2DFilterInputSlotIdx
+ /* filter is 2nd input of Conv2D and _MklConv2D. */) {
+ if (conv2d_node != nullptr) {
+ VLOG(1) << "MklLayoutRewritePass: unusual case of same filter"
+ << " feeding multiple Conv2D nodes: "
+ << filter_node->DebugString();
+ // We will not connect filter input of Conv2DBackpropInput
+ // to be safe here.
+ do_connect_conv2d_backprop_input_filter = false;
+ break;
+ } else {
+ conv2d_node = e->dst();
+ do_connect_conv2d_backprop_input_filter = true;
+ }
+ }
+ }
+ }
+
+ // Number of input slots to original op
+ // Input slots are represented by .Input() calls in REGISTER_OP.
+ int old_node_input_slots = old_node->op_def().input_arg_size();
+ // Actual number of inputs can be greater than or equal to number
+ // of Input slots because inputs of type list could be unfolded.
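+  // For example, a single list-typed input slot declared as "N * float" with
+  // N = 3 contributes three entries to 'old_node_inputs' but only one input
+  // slot.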
+ CHECK_GE(old_node_inputs.size(), old_node_input_slots);
+ int nn_slot_idx = 0; // slot index for inputs of new node
+
+ // Let's copy all inputs (TF tensors) of original node to new node.
+ int iidx = 0;
+ for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) {
+ // An input slot could be a single tensor or a list. We need
+ // to handle this case accordingly.
+ CHECK_LT(iidx, old_node_inputs.size());
+ const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx);
+ if (ArgIsList(arg)) {
+ std::vector<NodeBuilder::NodeOut> new_node_inputs;
+ int N = GetTensorListLength(arg, old_node);
+ GetNodesProducingTFTensorList(old_node_inputs, &iidx, N,
+ &new_node_inputs);
+ nb->Input(new_node_inputs);
+ nn_slot_idx++;
+ } else {
+ // Special case for connecting filter input of Conv2DBackpropInput
+ if (do_connect_conv2d_backprop_input_filter &&
+ iidx == kConv2DBackpropInputFilterInputSlotIdx) {
+ nb->Input(conv2d_node, kConv2DFilterOutputSlotIdx);
+ } else {
+ nb->Input(old_node_inputs[iidx].first, old_node_inputs[iidx].second);
+ }
+ iidx++;
+ nn_slot_idx++;
+ }
+ }
+
+  // If workspace tensors are available for this op and we are using
+  // contiguous ordering, then we need to add the Tensorflow workspace tensor
+  // here, because it is the last tensor in the list of Tensorflow tensors.
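+  //
+  // For illustration, the full input layout produced by this function under
+  // contiguous ordering is:
+  //   [TF_in_0, ..., TF_in_{N-1}, TF_workspace?,
+  //    Mkl_in_0, ..., Mkl_in_{N-1}, Mkl_workspace?]
+  // where the workspace entries are present only when
+  // 'are_workspace_tensors_available' is true.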
+ if (are_workspace_tensors_available) {
+ CHECK_EQ(workspace_tensors->size(), 2);
+ // Tensorflow tensor
+ nb->Input((*workspace_tensors)[0].node, (*workspace_tensors)[0].index);
+ nn_slot_idx++;
+ }
+
+  // Let's now set up all Mkl inputs to the new node.
+  // The number of Mkl inputs must be the same as the number of TF inputs.
+ iidx = 0;
+ for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) {
+ // An input slot could be a single tensor or a list. We need
+ // to handle this case accordingly.
+ CHECK_LT(iidx, old_node_inputs.size());
+ const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx);
+ if (ArgIsList(arg)) {
+ std::vector<NodeBuilder::NodeOut> new_node_inputs;
+ int N = GetTensorListLength(arg, old_node);
+ GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx,
+ N, &new_node_inputs);
+ nb->Input(new_node_inputs);
+ nn_slot_idx++;
+ } else {
+ Node* mkl_node = nullptr;
+ int mkl_node_output_slot = 0;
+ // Special case for connecting filter input of Conv2DBackpropInput
+ if (do_connect_conv2d_backprop_input_filter &&
+ iidx == kConv2DBackpropInputFilterInputSlotIdx) {
+ GetNodeProducingMklTensor(g, old_node, conv2d_node,
+ kConv2DFilterOutputSlotIdx, &mkl_node,
+ &mkl_node_output_slot);
+ } else {
+ GetNodeProducingMklTensor(g, old_node, old_node_inputs[iidx].first,
+ old_node_inputs[iidx].second, &mkl_node,
+ &mkl_node_output_slot);
+ }
+ nb->Input(mkl_node, mkl_node_output_slot);
+ iidx++;
+ nn_slot_idx++;
+ }
+ }
+
+  // If workspace tensors are available for this op and we are using
+  // contiguous ordering, then we need to add the Mkl workspace tensor here,
+  // because it is the last tensor in the list of Mkl tensors.
+ if (are_workspace_tensors_available) {
+ CHECK_EQ(workspace_tensors->size(), 2);
+ // Mkl tensor
+ nb->Input((*workspace_tensors)[1].node, (*workspace_tensors)[1].index);
+ nn_slot_idx++;
+ }
+
+ return nn_slot_idx;
+}
+
+Status MklLayoutRewritePass::SetUpInputs(
+ std::unique_ptr<Graph>* g,
+ const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
+ NodeBuilder* nb, Node* old_node) {
+ // Let's check if we need to add workspace tensors for this node.
+ // We add workspace edge only for MaxPool, LRN and BatchNorm.
+ std::vector<NodeBuilder::NodeOut> workspace_tensors;
+ bool are_workspace_tensors_available = false;
+ AddWorkSpaceEdgeIfNeeded(g, old_node, nb, &workspace_tensors,
+ &are_workspace_tensors_available);
+
+ int new_node_input_slots = 0;
+ if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+    // TODO(nhasabni): implement this function just for the sake of
+    // completeness.
+ // We do not use interleaved ordering right now.
+ return Status(
+ error::Code::UNIMPLEMENTED,
+ "Interleaved ordering of tensors is currently not supported.");
+ } else {
+ CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+ new_node_input_slots = SetUpContiguousInputs(
+ g, old_node_inputs, nb, old_node, &workspace_tensors,
+ are_workspace_tensors_available);
+ }
+
+ // Sanity check
+ int old_node_input_slots = old_node->op_def().input_arg_size();
+ if (!are_workspace_tensors_available) {
+    // If we are not adding workspace tensors for this op, then the total
+    // number of input slots to the new node _must_ be 2 times the number
+    // of input slots to the original node: N original Tensorflow tensors
+    // plus N Mkl tensors, one for each Tensorflow tensor.
+ CHECK_EQ(new_node_input_slots, old_node_input_slots * 2);
+ } else {
+    // If we are adding workspace tensors for this op, then the total number
+    // of input slots to the new node _must_ be 2 times the number of input
+    // slots to the original node plus 2: N original Tensorflow tensors, N
+    // Mkl tensors (one for each Tensorflow tensor), the workspace Tensorflow
+    // tensor, and the workspace Mkl tensor.
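+    // For example, an op with 3 input slots plus workspace tensors gets
+    // 3 TF inputs + 1 TF workspace + 3 Mkl inputs + 1 Mkl workspace
+    // = 3 * 2 + 2 = 8 input slots on the new node.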
+ CHECK_EQ(new_node_input_slots, old_node_input_slots * 2 + 2);
+ }
+
+ return Status::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Helper functions related to workspace pass
+//////////////////////////////////////////////////////////////////////////
+
+// TODO(nhasabni) We should move this to mkl_util.h.
+void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
+ std::unique_ptr<Graph>* g, Node** out, Node* orig_node) {
+  // We use a float tensor of shape {1} and value 0 as the dummy workspace
+  // tensor; workspace tensors have type float.
+ const DataType dt = DataTypeToEnum<float>::v();
+ TensorProto proto;
+ proto.set_dtype(dt);
+ float zero[1] = {0};
+ proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
+ 4);
+ TensorShape dummy_shape({1});
+ dummy_shape.AsProto(proto.mutable_tensor_shape());
+ TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
+ .Attr("value", proto)
+ .Attr("dtype", dt)
+                  .Device(orig_node->def().device())  // We place this node on
+                                                      // the same device as the
+                                                      // device of the original
+                                                      // node.
+ .Finalize(&**g, out));
+
+  // If the number of inputs to the original node is > 0, then we add a
+  // control dependency between the 1st input (index 0) of the original node
+  // and the dummy workspace node. This is needed because control-flow ops
+  // such as Enter, Merge, etc. require the frame_name of the dummy node to be
+  // the same as that of the rewritten node. Adding a control edge between the
+  // 1st input of the original node and the dummy node ensures that the dummy
+  // node is in the same frame as the original node. Choosing the 1st input is
+  // not necessary - any input of the original node is fine, because all the
+  // inputs of a node are always in the same frame.
+ if (orig_node->num_inputs() > 0) {
+ Node* orig_input0 = nullptr;
+ TF_CHECK_OK(orig_node->input_node(0,
+ const_cast<const Node**>(&orig_input0)));
+ CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
+ }
+
+ (*out)->set_assigned_device_name(orig_node->assigned_device_name());
+}
+
+void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
+ std::unique_ptr<Graph>* g, Node* orig_node, NodeBuilder* nb,
+ std::vector<NodeBuilder::NodeOut>* ws_tensors, bool* are_ws_tensors_added) {
+ bool workspace_edge_added = false; // Default initializer
+ CHECK_NOTNULL(are_ws_tensors_added);
+ *are_ws_tensors_added = false; // Default initializer
+
+ DataType T;
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+ for (auto ws : wsinfo_) {
+ if (orig_node->type_string() == ws.fwd_op &&
+ mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(
+ orig_node->type_string()), T)) {
+      // If this op is a fwd op, check whether there is an edge from this
+      // node's fwd_slot to the bwd op's bwd_slot. If there is such an edge,
+      // we only set the 'workspace_enabled' attribute to true on this node;
+      // the actual workspace edge is added when the corresponding backward
+      // op is rewritten.
+ for (const Edge* e : orig_node->out_edges()) {
+ if (e->src_output() == ws.fwd_slot &&
+ e->dst()->type_string() == ws.bwd_op &&
+ e->dst_input() == ws.bwd_slot) {
+ nb->Attr("workspace_enabled", true);
+ VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
+ << orig_node->type_string();
+ workspace_edge_added = true;
+ // We found the edge that we were looking for, so break.
+ break;
+ }
+ }
+
+ if (!workspace_edge_added) {
+ // If we are here, then we did not find backward operator for this
+ // node.
+ nb->Attr("workspace_enabled", false);
+ }
+ } else if (orig_node->type_string() == ws.bwd_op &&
+ mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(
+ orig_node->type_string()), T)) {
+      // If this op is a bwd op, then we need to add a workspace edge and
+      // its Mkl tensor edge between its corresponding fwd op and this op.
+      // The corresponding fwd op is specified in the 'fwd_op' field of the
+      // workspace info. fwd_slot and bwd_slot in the workspace info specify
+      // which slots connect the forward and backward op. Once all these
+      // criteria match, we add a workspace edge between ws_fwd_slot and
+      // ws_bwd_slot. The slot of its corresponding Mkl tensor is determined
+      // by the interleaved/contiguous ordering; the function
+      // DataIndexToMetaDataIndex maps the location of a Tensorflow tensor to
+      // the location of its Mkl tensor.
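+      // Illustrative example (assuming the contiguous ordering described
+      // above): if the rewritten fwd op has M TensorFlow outputs (2*M outputs
+      // in total), the Mkl tensor for TensorFlow output slot s is at slot
+      // M + s.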
+ for (const Edge* e : orig_node->in_edges()) {
+ if (e->src_output() == ws.fwd_slot &&
+ // We would have rewritten the forward op, so we need to use
+ // GetMklOpName call to get its Mkl name.
+ e->src()->type_string() == mkl_op_registry::GetMklOpName(
+ ws.fwd_op) &&
+ e->dst_input() == ws.bwd_slot) {
+ nb->Attr("workspace_enabled", true);
+ CHECK_NOTNULL(ws_tensors);
+ // Add workspace edge between fwd op and bwd op.
+ ws_tensors->push_back(NodeBuilder::NodeOut(e->src(), ws.ws_fwd_slot));
+ // Add Mkl tensor edge for workspace edge between fwd op and bwd op.
+ ws_tensors->push_back(NodeBuilder::NodeOut(
+ e->src(), DataIndexToMetaDataIndex(ws.ws_fwd_slot,
+ e->src()->num_outputs())));
+ *are_ws_tensors_added = true;
+          // In terms of input ordering, the workspace edge (and its Mkl
+          // tensor) is the last input of both the fwd op and the bwd op, so
+          // SetUpInputs appends the workspace tensors after all other
+          // inputs.
+ VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
+ << orig_node->type_string();
+ workspace_edge_added = true;
+ // We found the edge that we were looking for, so break.
+ break;
+ }
+ }
+
+      // If we reach here, we did not find a fwd op that feeds this bwd op.
+      // In this case, we generate dummy tensors for the workspace input and
+      // the Mkl tensor for the workspace, and set workspace_enabled to false.
+ if (!workspace_edge_added) {
+ nb->Attr("workspace_enabled", false);
+ Node* dmt_ws = nullptr; // Dummy tensor for workspace
+ Node* dmt_mkl_ws = nullptr; // Dummy Mkl tensor for workspace
+ GetDummyWorkspaceTensorNode(g, &dmt_ws, orig_node);
+ GetDummyMklTensorNode(g, &dmt_mkl_ws, orig_node);
+ CHECK_NOTNULL(dmt_ws);
+ CHECK_NOTNULL(dmt_mkl_ws);
+ CHECK_NOTNULL(ws_tensors);
+ // We add dummy tensor as workspace tensor.
+ ws_tensors->push_back(NodeBuilder::NodeOut(dmt_ws, 0));
+ // We add dummy tensor as Mkl tensor for workspace tensor.
+ ws_tensors->push_back(NodeBuilder::NodeOut(dmt_mkl_ws, 0));
+ *are_ws_tensors_added = true;
+ VLOG(1) << "MklLayoutRewritePass: dummy workspace_enabled for "
+ << orig_node->type_string();
+ }
+ } else {
+ // If this node does not match any workspace info, then we do not
+ // do anything special for workspace propagation for it.
+ }
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Op-specific functions to copy attributes from old node to new node
+//////////////////////////////////////////////////////////////////////////
+
+void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node,
+ NodeBuilder* nb) {
+ DataType T;
+ string data_format;
+ string padding;
+ std::vector<int32> strides;
+ bool use_cudnn_on_gpu;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+ TF_CHECK_OK(
+ GetNodeAttr(orig_node->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+ nb->Attr("strides", strides);
+ nb->Attr("padding", padding);
+ nb->Attr("data_format", data_format);
+ nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
+}
+
+void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node,
+ NodeBuilder* nb) {
+ DataType T;
+ int N;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+ nb->Attr("N", N);
+}
+
+void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node,
+ NodeBuilder* nb) {
+ DataType T;
+ string data_format;
+ std::vector<int32> strides;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+ nb->Attr("strides", strides);
+ nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node,
+ NodeBuilder* nb) {
+ DataType T;
+ int depth_radius;
+ float bias;
+ float alpha;
+ float beta;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "depth_radius", &depth_radius));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "bias", &bias));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "alpha", &alpha));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "beta", &beta));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+ nb->Attr("depth_radius", depth_radius);
+ nb->Attr("bias", bias);
+ nb->Attr("alpha", alpha);
+ nb->Attr("beta", beta);
+}
+
+void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node,
+ NodeBuilder* nb) {
+ DataType T;
+ string data_format;
+ string padding;
+ std::vector<int32> ksize, strides;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "ksize", &ksize));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+ nb->Attr("ksize", ksize);
+ nb->Attr("strides", strides);
+ nb->Attr("padding", padding);
+ nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
+ NodeBuilder* nb) {
+ DataType T;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+}
+
+void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
+ NodeBuilder* nb) {
+ DataType T;
+ DataType Tshape;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tshape", &Tshape));
+ // Add attributes to new node.
+ nb->Attr("T", T);
+ nb->Attr("Tshape", Tshape);
+}
+
+void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node,
+ NodeBuilder* nb) {
+ DataType T;
+ string data_format;
+ int num_split;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "num_split", &num_split));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+ nb->Attr("num_split", num_split);
+ nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsConcat(const Node* orig_node,
+ NodeBuilder* nb) {
+ DataType T;
+ int N;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+ nb->Attr("N", N);
+}
+
+void MklLayoutRewritePass::CopyAttrsConcatV2(const Node* orig_node,
+ NodeBuilder* nb) {
+ DataType T;
+ int N;
+ DataType tidx;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tidx", &tidx));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+ nb->Attr("N", N);
+ nb->Attr("Tidx", tidx);
+}
+
+void MklLayoutRewritePass::CopyAttrsFusedBatchNorm(const Node* orig_node,
+ NodeBuilder* nb) {
+ DataType T;
+ float epsilon;
+ string data_format;
+ bool is_training;
+
+ // Get all attributes from old node.
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "epsilon", &epsilon));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+ TF_CHECK_OK(GetNodeAttr(orig_node->def(), "is_training", &is_training));
+
+ // Add attributes to new node.
+ nb->Attr("T", T);
+ nb->Attr("epsilon", epsilon);
+ nb->Attr("data_format", data_format);
+ nb->Attr("is_training", is_training);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Helper functions related to node merge pass
+//////////////////////////////////////////////////////////////////////////
+
+Node* MklLayoutRewritePass::CheckForNodeMerge(const Node* a) const {
+ // TODO(nhasabni) Add check for type of node similar to CheckForNodeRewrite
+ // once we support BiasAddGrad as Mkl layer.
+
+ // Search for all matching mergeinfo.
+ // We allow more than one match for extensibility.
+ std::vector<const MergeInfo*> matching_mi;
+ for (auto mi = minfo_.cbegin(); mi != minfo_.cend(); ++mi) {
+ if (a->type_string() == mi->op1 || a->type_string() == mi->op2) {
+ matching_mi.push_back(&*mi);
+ }
+ }
+
+ for (const MergeInfo* mi : matching_mi) {
+ // Get the operand with which 'a' can be merged.
+ Node* b = nullptr;
+ if ((b = mi->get_node_to_be_merged(a)) == nullptr) {
+ continue;
+ }
+
+ // Get the control edges and input of node
+ const int N_in = a->num_inputs();
+ gtl::InlinedVector<Node*, 4> a_control_edges;
+ gtl::InlinedVector<std::pair<Node*, int>, 4> a_in(N_in);
+ FillInputs(a, &a_control_edges, &a_in);
+
+ const int B_in = b->num_inputs();
+ gtl::InlinedVector<Node*, 4> b_control_edges;
+ gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(B_in);
+ FillInputs(b, &b_control_edges, &b_in);
+
+ // Shouldn't merge if a and b have different control edges.
+ if (a_control_edges != b_control_edges) {
+ continue;
+ } else {
+ // We found a match.
+ return b;
+ }
+ }
+
+ return nullptr;
+}
+
+Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
+ Node* m, Node* n) {
+ CHECK_EQ(((m->type_string() == csinfo_.bias_add &&
+ n->type_string() == csinfo_.conv2d)) ||
+ ((n->type_string() == csinfo_.bias_add &&
+ m->type_string() == csinfo_.conv2d)), true);
+
+  // If 'm' is BiasAdd, then 'n' is Conv2D. Since Conv2D feeds BiasAdd,
+  // BiasAdd is the successor node and Conv2D the predecessor node.
+ Node* pred = m->type_string() == csinfo_.bias_add ? n : m;
+ Node* succ = m->type_string() == csinfo_.bias_add ? m : n;
+
+ // 1. Get all attributes from input nodes.
+ DataType T_pred, T_succ;
+ string padding;
+ std::vector<int32> strides;
+ string data_format_pred, data_format_succ;
+  bool use_cudnn_on_gpu;
+ TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
+ TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
+ TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding));
+ TF_CHECK_OK(GetNodeAttr(pred->def(), "strides", &strides));
+ TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred));
+ TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
+ TF_CHECK_OK(
+ GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
+  // We check that the data formats of both succ and pred are the same.
+  // We expect them to match, so this could be an assert; but an assert can
+  // be too strict, so we enforce it as a check instead. If the check fails,
+  // we do not merge the two nodes. We do the same check for devices.
+ if (data_format_pred != data_format_succ || T_pred != T_succ ||
+ pred->assigned_device_name() != succ->assigned_device_name() ||
+ pred->def().device() != succ->def().device()) {
+ return Status(error::Code::INVALID_ARGUMENT,
+ "data_format or T attribute or devices of Conv2D and "
+ "BiasAdd do not match. Will skip node merge optimization");
+ }
+
+ const int succ_num = succ->num_inputs();
+ gtl::InlinedVector<Node*, 4> succ_control_edges;
+ gtl::InlinedVector<std::pair<Node*, int>, 4> succ_in(succ_num);
+ FillInputs(succ, &succ_control_edges, &succ_in);
+
+ const int pred_num = pred->num_inputs();
+ gtl::InlinedVector<Node*, 4> pred_control_edges;
+ gtl::InlinedVector<std::pair<Node*, int>, 4> pred_in(pred_num);
+ FillInputs(pred, &pred_control_edges, &pred_in);
+
+  // We need to ensure that Conv2D feeds only BiasAdd (no other operator
+  // consumes the output of Conv2D). If that is not the case, then we cannot
+  // merge Conv2D with BiasAdd.
+ const int kFirstOutputSlot = 0;
+ for (const Edge* e : pred->out_edges()) {
+ if (e->src_output() == kFirstOutputSlot && e->dst() != succ) {
+ return Status(error::Code::INVALID_ARGUMENT,
+ "Conv2D does not feed to BiasAdd, or "
+ "it feeds BiasAdd but has multiple outputs. "
+ "Will skip node merge optimization");
+ }
+ }
+
+  // 2. Get inputs from both nodes.
+  // Find the two inputs of Conv2D and the bias input of BiasAdd.
+  // Get operands 0 and 1 of Conv2D.
+ CHECK_EQ(pred->in_edges().size(), 2); // Conv2D must have 2 inputs.
+ // Get operand 1 of add_bias
+ // BiasAdd must have 2 inputs: Conv, bias
+ CHECK_EQ(succ->in_edges().size(), 2);
+
+  // Build the new node. We use the node name of BiasAdd as the name of the
+  // new node, but change the op name.
+ NodeBuilder nb(succ->name(), csinfo_.conv2d_with_bias);
+ nb.Input(pred_in[0].first, pred_in[0].second); // In1 of Conv2D
+ // pred_in[1] will be 2nd Tensorflow tensor for Conv2D.
+ nb.Input(pred_in[1].first, pred_in[1].second); // In2 of Conv2D
+ // In1 of BiasAdd is same as output of Conv2D.
+ nb.Input(succ_in[1].first, succ_in[1].second); // In2 of BiasAdd
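+  // The merged node thus takes three data inputs, in this order:
+  // (input of Conv2D, filter of Conv2D, bias of BiasAdd).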
+
+ // Copy attributes from Conv2D to Conv2DWithBias.
+ CopyAttrsConv2D(const_cast<const Node*>(pred), &nb);
+
+ // Copy the device assigned to old node to new node.
+ nb.Device(succ->def().device());
+
+ // Create node.
+ Node* new_node;
+ nb.Finalize(&**g, &new_node);
+ CHECK_NOTNULL(new_node);
+
+  // Incoming data edges from the 'pred' and 'succ' nodes to 'new_node' were
+  // already set up above via the NodeBuilder inputs. We handle control edges
+  // now.
+ for (const Edge* e : pred->in_edges()) {
+ if (e->IsControlEdge()) {
+ CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+ }
+ }
+ for (const Edge* e : succ->in_edges()) {
+ if (e->IsControlEdge()) {
+ CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+ }
+ }
+
+ // Incoming edges are fixed, we will fix the outgoing edges now.
+ // First, we will fix outgoing control edges from 'pred' node.
+ for (const Edge* e : pred->out_edges()) {
+ if (e->IsControlEdge()) {
+ CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+ }
+ }
+
+ // Second, we will fix outgoing control and data edges from 'succ' node.
+ for (const Edge* e : succ->out_edges()) {
+ if (e->IsControlEdge()) {
+ CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+ } else {
+ // BiasAdd has only 1 output (at slot 0) and merged node also has only 1
+ // output (at slot 0).
+ const int kConv2DWithBiasOutputSlot = 0;
+ CHECK_NOTNULL((*g)->AddEdge(new_node, kConv2DWithBiasOutputSlot,
+ e->dst(), e->dst_input()));
+ }
+ }
+
+ // Copy device assigned to old node to new node.
+ // It's ok to use pred or succ as we have enforced a check that
+ // both have same device assigned.
+ new_node->set_assigned_device_name(pred->assigned_device_name());
+
+ VLOG(1) << "MklLayoutRewritePass: Merged old node:" << pred->DebugString()
+ << ", and node: " << succ->DebugString()
+ << ", into node:" << new_node->DebugString();
+
+ (*g)->RemoveNode(succ);
+ (*g)->RemoveNode(pred);
+
+ return Status::OK();
+}
+
+Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad(
+ std::unique_ptr<Graph>* g, Node* m, Node* n) {
+ CHECK_EQ(((m->type_string() == csinfo_.bias_add_grad &&
+ n->type_string() == csinfo_.conv2d_grad_filter)) ||
+ ((n->type_string() == csinfo_.bias_add_grad &&
+ m->type_string() == csinfo_.conv2d_grad_filter)), true);
+
+ // If 'm' is BiasAddGrad, then 'n' is BackpropFilter.
+ Node* badd = m->type_string() == csinfo_.bias_add_grad ? m : n;
+ Node* fltr = m->type_string() == csinfo_.bias_add_grad ? n : m;
+
+ // Sanity check for attributes from input nodes.
+ DataType T_b, T_f;
+ string data_format_b, data_format_f;
+ TF_CHECK_OK(GetNodeAttr(badd->def(), "T", &T_b));
+ TF_CHECK_OK(GetNodeAttr(fltr->def(), "T", &T_f));
+ TF_CHECK_OK(GetNodeAttr(badd->def(), "data_format", &data_format_b));
+ TF_CHECK_OK(GetNodeAttr(fltr->def(), "data_format", &data_format_f));
+ if (data_format_b != data_format_f || T_b != T_f ||
+ badd->assigned_device_name() != fltr->assigned_device_name() ||
+ badd->def().device() != fltr->def().device()) {
+ return Status(error::Code::INVALID_ARGUMENT,
+ "data_format or T attribute or devices of "
+ "Conv2DBackpropFilter and BiasAddGrad do not match. "
+ "Will skip node merge optimization");
+ }
+
+  // We will use the node name of Conv2DBackpropFilter as the name of the new
+  // node, because BackpropFilterWithBias also emits the bias output.
+ NodeBuilder nb(fltr->name(), csinfo_.conv2d_grad_filter_with_bias);
+  // Since Conv2DBackpropFilterWithBias has the same number of inputs as
+  // Conv2DBackpropFilter, we can just copy the input edges directly. We don't
+  // need to copy any data input of BiasAddGrad because that input also goes
+  // to Conv2DBackpropFilter.
+ const int fltr_ins = fltr->num_inputs();
+ gtl::InlinedVector<Node*, 4> fltr_control_edges;
+ gtl::InlinedVector<std::pair<Node*, int>, 4> fltr_in_edges(fltr_ins);
+ FillInputs(fltr, &fltr_control_edges, &fltr_in_edges);
+ for (int idx = 0; idx < fltr_ins; idx++) {
+ nb.Input(fltr_in_edges[idx].first, fltr_in_edges[idx].second);
+ }
+
+ // Copy attributes from Conv2DBackpropFilter.
+ CopyAttrsConv2D(const_cast<const Node*>(fltr), &nb);
+
+ // Copy the device assigned to old node to new node.
+ nb.Device(fltr->def().device());
+
+ // Create node.
+ Node* new_node;
+ nb.Finalize(&**g, &new_node);
+ CHECK_NOTNULL(new_node);
+
+  // Incoming data edges from the BiasAddGrad and Conv2DBackpropFilter nodes
+  // to 'new_node' were already set up above via the NodeBuilder inputs. We
+  // handle control edges now.
+ for (const Edge* e : badd->in_edges()) {
+ if (e->IsControlEdge()) {
+ CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+ }
+ }
+ for (const Edge* e : fltr->in_edges()) {
+ if (e->IsControlEdge()) {
+ CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+ }
+ }
+
+ // Incoming edges are fixed, we will fix the outgoing edges now.
+ // First, we will fix outgoing control edges from 'badd' node.
+ // Conv2DBackpropFilter has 1 output -- filter_grad.
+ // Conv2DBackpropFilterWithBias has 2 outputs -- filter_grad and
+ // bias_grad. But filter_grad is at same slot number (0) in both the
+ // nodes. bias_grad is at slot number 1 in Conv2DBackpropFilterWithBias, while
+ // it is at slot number 0 in BiasAddGrad.
+ const int kMergedNodeFilterGradOutputIdx = 0;
+ const int kMergedNodeBiasGradOutputIdx = 1;
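+  // Hence, data edges that consumed BiasAddGrad's output (slot 0) are rewired
+  // to slot 1 of the merged node, while data edges from
+  // Conv2DBackpropFilter's output stay at slot 0.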
+
+ for (const Edge* e : badd->out_edges()) {
+ if (e->IsControlEdge()) {
+ CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+ } else {
+ CHECK_NOTNULL((*g)->AddEdge(new_node, kMergedNodeBiasGradOutputIdx,
+ e->dst(), e->dst_input()));
+ }
+ }
+
+ // Second, we will fix outgoing control and data edges from 'fltr' node.
+ for (const Edge* e : fltr->out_edges()) {
+ if (e->IsControlEdge()) {
+ CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+ } else {
+ CHECK_NOTNULL((*g)->AddEdge(new_node, kMergedNodeFilterGradOutputIdx,
+ e->dst(), e->dst_input()));
+ }
+ }
+
+ // Copy device assigned to old node to new node.
+ // It's ok to use badd or fltr as we have enforced a check that
+ // both have same device assigned.
+ new_node->set_assigned_device_name(badd->assigned_device_name());
+
+ VLOG(1) << "MklLayoutRewritePass: Merged old node:" << badd->DebugString()
+ << ", and node: " << fltr->DebugString()
+ << ", into node:" << new_node->DebugString();
+
+ (*g)->RemoveNode(badd);
+ (*g)->RemoveNode(fltr);
+
+ return Status::OK();
+}
+
+Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* m,
+ Node* n) {
+ CHECK_NOTNULL(m);
+ CHECK_NOTNULL(n);
+
+ if (((m->type_string() == csinfo_.bias_add &&
+ n->type_string() == csinfo_.conv2d)) ||
+ ((n->type_string() == csinfo_.bias_add &&
+ m->type_string() == csinfo_.conv2d))) {
+ return this->MergeConv2DWithBiasAdd(g, m, n);
+ }
+
+ if (((m->type_string() == csinfo_.bias_add_grad &&
+ n->type_string() == csinfo_.conv2d_grad_filter)) ||
+ ((n->type_string() == csinfo_.bias_add_grad &&
+ m->type_string() == csinfo_.conv2d_grad_filter))) {
+ return this->MergeConv2DBackpropFilterWithBiasAddGrad(g, m, n);
+ }
+
+ return Status(error::Code::UNIMPLEMENTED,
+ "Unimplemented case for node merge optimization.");
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Helper functions for node rewrite
+//////////////////////////////////////////////////////////////////////////
+
+Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
+ Node* orig_node,
+ const RewriteInfo* ri) {
+ CHECK_NOTNULL(ri);
+ CHECK_NOTNULL(orig_node);
+
+ VLOG(1) << "MklLayoutRewritePass: Original node:" << orig_node->DebugString();
+
+ // Get all inputs.
+ int num_inputs = orig_node->in_edges().size();
+
+  // Do not count control edges among the inputs.
+ for (const Edge* e : orig_node->in_edges()) {
+ if (e->IsControlEdge()) {
+ num_inputs--;
+ }
+ }
+
+ gtl::InlinedVector<Node*, 4> control_edges;
+ gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num_inputs);
+ FillInputs(orig_node, &control_edges, &inputs);
+
+ // Build new node. We use same name as original node, but change the op name.
+ NodeBuilder nb(orig_node->name().c_str(), ri->new_name.c_str());
+ // Copy user-specified device assigned to original node to new node.
+ nb.Device(orig_node->def().device());
+ // Set up new inputs to the rewritten node.
+ Status s = SetUpInputs(g, inputs, &nb, orig_node);
+ if (s != Status::OK()) {
+ return s;
+ }
+
+ ri->copy_attrs(const_cast<const Node*>(orig_node), &nb);
+ // Set the Mkl layer label for this op.
+ nb.Attr("_kernel", mkl_op_registry::kMklOpLabel);
+
+ // Finalize graph and get new node.
+ Node* new_node = nullptr;
+ TF_CHECK_OK(nb.Finalize(&**g, &new_node));
+ CHECK_NOTNULL(new_node);
+
+  // Incoming data edges from 'orig_node' to 'new_node' were already set up by
+  // SetUpInputs via the NodeBuilder. We need to handle control edges now.
+ for (const Edge* e : orig_node->in_edges()) {
+ if (e->IsControlEdge()) {
+ CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+ }
+ }
+
+  // Copy outgoing edges from 'orig_node' to 'new_node', since the outputs
+  // follow the same ordering among Tensorflow tensors and Mkl tensors. We
+  // need to connect the Tensorflow tensors appropriately: the nth output of
+  // the original node becomes the 2*nth output of the Mkl node under the
+  // interleaved ordering of tensors, and remains the nth output under the
+  // contiguous ordering. GetTensorDataIndex provides this mapping function.
+ for (const Edge* e : orig_node->out_edges()) {
+ if (e->IsControlEdge()) {
+ CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+ } else {
+ CHECK_NOTNULL((*g)->AddEdge(new_node, GetTensorDataIndex(e->src_output(),
+ e->src()->num_outputs()),
+ e->dst(), e->dst_input()));
+ }
+ }
+
+  // Copy the runtime device assigned to the original node to the new node.
+ new_node->set_assigned_device_name(orig_node->assigned_device_name());
+
+ // Delete original node and mark new node as rewritten.
+ (*g)->RemoveNode(orig_node);
+
+ VLOG(1) << "MklLayoutRewritePass: New node:" << new_node->DebugString();
+ return Status::OK();
+}
+
+const MklLayoutRewritePass::RewriteInfo*
+MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
+ CHECK_NOTNULL(n);
+
+ // First check if node along with its type is supported by MKL layer.
+ // We do not want to rewrite an op into Mkl op if types are not supported.
+ // E.g., MklRelu does not support INT32. So we cannot rewrite Relu to
+ // MklRelu if type is INT32.
+ DataType T;
+ if (!GetNodeAttr(n->def(), "T", &T).ok()) {
+ return nullptr;
+ }
+
+ // We make an exception for __MklDummyConv2DWithBias and
+ // __MklConv2DBackpropFilterWithBias since their names do not match Mkl node
+ // names.
+ if (n->type_string() != csinfo_.conv2d_with_bias &&
+ n->type_string() != csinfo_.conv2d_grad_filter_with_bias &&
+ !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(
+ n->type_string()), T)) {
+ return nullptr;
+ }
+
+  // For elementwise nodes, we reuse the Eigen implementation and pass the MKL
+  // metadata tensor through so we can avoid conversions. However, if all
+  // incoming edges are in TF format, we don't need all this overhead, so we
+  // replace an elementwise node only if at least one of its parents is an MKL
+  // node.
+  //
+  // Identity nodes can likewise skip replacement if they are not fed by any
+  // MKL nodes.
+ //
+ // TODO(vrane): Add implementation for element-wise ops that doesn't reuse
+ // eigen code to reduce cross-library dependency.
+ VLOG(1) << "ELEMENTWISE: checking op: " << n->type_string();
+ if (mkl_op_registry::IsMklElementWiseOp(
+ mkl_op_registry::GetMklOpName(n->type_string()), T) ||
+ n->type_string().find("Identity") != string::npos) {
+ VLOG(1) << "ELEMENTWISE: op is elementwise: " << n->type_string();
+ bool incoming_mkl_edge = false;
+ int num_parent = 0;
+ for (auto parent : n->in_edges()) {
+ if (mkl_op_registry::IsMklOp(parent->src()->type_string(), T)) {
+ VLOG(1) << "ELEMENTWISE: parent " << num_parent++ << " is MKL op: "
+ << parent->src()->type_string();
+ incoming_mkl_edge = true;
+ break;
+ } else {
+ VLOG(1) << "ELEMENTWISE: parent " << num_parent++ << " is NON-MKL op: "
+ << parent->src()->type_string();
+ }
+ }
+ if (incoming_mkl_edge == false) {
+ VLOG(1) << "ELEMENTWISE: Skipping replacement of elementwise node which has no MKL "
+ "parents.";
+ return nullptr;
+ } else {
+ VLOG(1) << "ELEMENTWISE: Replacing elementwise node " << n->type_string() <<
+ " which has MKL parents";
+ }
+ }
+
+ // We now check if rewrite rule applies for this op. If rewrite rule passes
+ // for this op, then we rewrite it to Mkl op.
+ // Find matching RewriteInfo and then check that rewrite rule applies.
+ for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) {
+ if (n->type_string().compare(ri->name) == 0 &&
+ ri->rewrite_rule(n)) {
+ return &*ri;
+ }
+ }
+
+ // Else return not found.
+ return nullptr;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Run function for the pass
+///////////////////////////////////////////////////////////////////////////////
+
+bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
+ bool result = false;
+ CHECK_NOTNULL(g);
+
+ DumpGraph("Before running MklLayoutRewritePass", &**g);
+
+ std::vector<Node*> order;
+ GetReversePostOrder(**g, &order); // This will give us topological sort.
+ for (Node* n : order) {
+ // If node is not an op or it cannot run on CPU device, then skip.
+ if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
+ continue;
+ }
+
+ Node* m = nullptr;
+ if ((m = CheckForNodeMerge(n)) != nullptr && CanOpRunOnCPUDevice(m)) {
+      // Check if node 'n' can be merged with any other node. If it can be,
+      // 'm' contains the node with which it can be merged.
+ string n1_name = n->name();
+ string n2_name = m->name();
+
+ VLOG(1) << "MklLayoutRewritePass: Scheduled nodes " << n1_name << " and "
+ << n2_name << " for merging";
+
+ if (MergeNode(g, n, m) == Status::OK()) {
+ VLOG(1) << "MklLayoutRewritePass: Merged nodes " << n1_name << " and "
+ << n2_name;
+ result = true;
+ }
+ }
+ }
+
+ DumpGraph("After running MklLayoutRewritePass(NodeMerge)", &**g);
+
+ order.clear();
+ GetReversePostOrder(**g, &order); // This will give us topological sort.
+ for (Node* n : order) {
+ // If node is not an op or it cannot run on CPU device, then skip.
+ if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
+ continue;
+ }
+
+ const RewriteInfo* ri = nullptr;
+ // We will first search if node is to be rewritten.
+ if ((ri = CheckForNodeRewrite(n)) != nullptr) {
+ string node_name = n->name();
+ string op_name = n->type_string();
+
+ VLOG(1) << "MklLayoutRewritePass: Scheduled node " << node_name
+ << " with op " << op_name << " for rewrite using"
+ << " layout optimization.";
+
+ if (RewriteNode(g, n, ri) == Status::OK()) {
+ VLOG(1) << "MklLayoutRewritePass: rewrote node " << node_name
+ << " with op " << op_name << " for Mkl layout optimization.";
+ result = true;
+ }
+ }
+ }
+
+ DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite)", &**g);
+
+ return result;
+}
+
+bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
+ return MklLayoutRewritePass().RunPass(g);
+}
+
+Status MklLayoutRewritePass::Run(
+ const GraphOptimizationPassOptions& options) {
+ if (options.graph == nullptr && options.partition_graphs == nullptr) {
+ return Status::OK();
+ }
+
+  auto process_graph = [&](std::unique_ptr<Graph>* g) {
+    // RunPass rewrites the graph held by '*g' in place, so no ownership
+    // transfer is needed here.
+    RunPass(g);
+  };
+
+ if (kMklLayoutRewritePassGroup !=
+ OptimizationPassRegistry::POST_PARTITIONING) {
+ // For any pre-partitioning phase, a graph is stored in options.graph.
+ process_graph(options.graph);
+ } else {
+ // For post partitioning phase, graphs are stored in
+ // options.partition_graphs.
+ for (auto& pg : *options.partition_graphs) {
+ process_graph(&pg.second);
+ }
+ }
+
+ return Status::OK();
+}
+#endif // INTEL_MKL_DNN
} // namespace tensorflow
#endif
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index abc63e4..75f7ca2 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -37,6 +37,9 @@
#include "tensorflow/core/platform/test_benchmark.h"
namespace tensorflow {
+
+#ifndef INTEL_MKL_DNN
+
namespace {
const char kCPUDevice[] = "/job:a/replica:0/task:0/device:CPU:0";
@@ -1881,6 +1884,1627 @@
BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
} // namespace
+
+#else // INTEL_MKL_DNN
+
+namespace {
+
+const char kCPUDevice[] = "/job:a/replica:0/task:0/device:CPU:0";
+const char kGPUDevice[] = "/job:a/replica:0/task:0/device:GPU:0";
+
+static void InitGraph(const string& s, Graph* graph,
+ const string& device = kCPUDevice) {
+ GraphDef graph_def;
+
+ auto parser = protobuf::TextFormat::Parser();
+ // parser.AllowRelaxedWhitespace(true);
+ CHECK(parser.MergeFromString(s, &graph_def)) << s;
+ GraphConstructorOptions opts;
+ TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, graph));
+
+ for (Node* node : graph->nodes()) {
+ node->set_assigned_device_name(device);
+ }
+}
+
+class MklLayoutPassTest : public ::testing::Test {
+ public:
+ MklLayoutPassTest() : graph_(OpRegistry::Global()) {}
+
+ void InitGraph(const string& s, const string& device = kCPUDevice) {
+ ::tensorflow::InitGraph(s, &graph_, device);
+ original_ = CanonicalGraphString(&graph_);
+ }
+
+ static bool IncludeNode(const Node* n) { return n->IsOp(); }
+
+ static string EdgeId(const Node* n, int index) {
+ if (index == 0) {
+ return n->name();
+ } else if (index == Graph::kControlSlot) {
+ return strings::StrCat(n->name(), ":control");
+ } else {
+ return strings::StrCat(n->name(), ":", index);
+ }
+ }
+
+ string CanonicalGraphString(Graph* g) {
+ std::vector<string> nodes;
+ std::vector<string> edges;
+ for (const Node* n : g->nodes()) {
+ if (IncludeNode(n)) {
+ nodes.push_back(strings::StrCat(n->name(), "(", n->type_string(), ")"));
+ }
+ }
+ for (const Edge* e : g->edges()) {
+ if (IncludeNode(e->src()) && IncludeNode(e->dst())) {
+ edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->",
+ EdgeId(e->dst(), e->dst_input())));
+ }
+ }
+ // Canonicalize
+ std::sort(nodes.begin(), nodes.end());
+ std::sort(edges.begin(), edges.end());
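+    // The result has the form "<sorted nodes>|<sorted edges>", e.g.
+    // "A(Input);B(Zeta)|A->B"; the EXPECT_EQ strings in the tests below are
+    // written against this form.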
+ return strings::StrCat(str_util::Join(nodes, ";"), "|",
+ str_util::Join(edges, ";"));
+ }
+
+ string DoMklLayoutOptimizationPass() {
+ string before = CanonicalGraphString(&graph_);
+ LOG(ERROR) << "Before MKL layout rewrite pass: " << before;
+
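+    // Wrap the fixture-owned graph in a heap-allocated unique_ptr that is
+    // never destroyed, so graph_ itself is not deleted; the pass only needs a
+    // unique_ptr<Graph>* in order to rewrite the graph in place.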
+ std::unique_ptr<Graph>* ug = new std::unique_ptr<Graph>(&graph_);
+ RunMklLayoutRewritePass(ug);
+
+ string result = CanonicalGraphString(&graph_);
+ LOG(ERROR) << "After MKL layout rewrite pass: " << result;
+ return result;
+ }
+
+ const string& OriginalGraph() const { return original_; }
+
+ Graph graph_;
+ string original_;
+};
+
+REGISTER_OP("Input").Output("o: float").SetIsStateful();
+REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful();
+REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
+REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful();
+REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful();
+REGISTER_OP("_MklInput2").Output("o: uint8")
+ .Output("o1: uint8").SetIsStateful();
+
+/////////////////////////////////////////////////////////////////////
+// Unit tests related to node merge optimization
+/////////////////////////////////////////////////////////////////////
+
+TEST_F(MklLayoutPassTest, Basic) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Zeta);D(Zeta)|"
+ "A->C;A->D;B->C:1;B->D:1");
+}
+
+// Test set 1: Conv2D + AddBias
+
+// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Zeta(E,Y)
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
+ CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['C', 'D'] }"
+ "node { name: 'Y' op: 'Input'}"
+ "node { name: 'Z' op: 'Zeta'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['E', 'Y']}");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+ "DMT/_2(Const);E(_MklConv2DWithBias);Y(Input);Z(Zeta)|A->E;"
+ "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+ "A:control->DMT/_2:control;B->E:1;D->E:2;DMT/_0->E:3;DMT/_1->E:4;"
+ "DMT/_2->E:5;E->Z;Y->Z:1");
+}
+
+// Graph contains only Conv2D, no AddBias.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_NoAddBias) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(_MklConv2D);DMT/_0(Const);DMT/_1(Const)|"
+ "A->C;A:control->DMT/_0:control;A:control->DMT/_1:control;B->C:1;"
+ "DMT/_0->C:2;DMT/_1->C:3");
+}
+
+// Conv2D output does not go to BiasAdd.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow1) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'Input'}"
+ "node { name: 'F' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D', 'E'] }"); // Output of _MklConv2D does not go to BiasAdd.
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(_MklConv2D);D(Input);DMT/_0(Const);"
+ "DMT/_1(Const);E(Input);F(BiasAdd)|A->C;A:control->DMT/_0:control;"
+ "A:control->DMT/_1:control;B->C:1;D->F;DMT/_0->C:2;DMT/_1->C:3;"
+ "E->F:1");
+}
+
+// Conv2D has two outgoing edges: BiasAdd and some other dummy node (Zeta).
+// Merge should not be done in such case.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'Input'}"
+ "node { name: 'F' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D', 'E'] }" // Conv2D has two outputs.
+ // No merge should happen.
+ "node { name: 'G' op: 'Zeta'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'E'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(_MklConv2D);D(Input);DMT/_0(Const);"
+ "DMT/_1(Const);E(Input);F(BiasAdd);G(Zeta)|A->C;"
+ "A:control->DMT/_0:control;A:control->DMT/_1:control;B->C:1;C->G;"
+ "D->F;DMT/_0->C:2;DMT/_1->C:3;E->F:1;E->G:1");
+}
+
+// data_format attribute value mismatch. Merge should not be done
+// in such case.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NHCW' } }"
+ " input: ['C', 'D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(_MklConv2D);D(Input);DMT/_0(Const);"
+ "DMT/_1(Const);E(BiasAdd)|A->C;A:control->DMT/_0:control;"
+ "A:control->DMT/_1:control;B->C:1;C->E;D->E:1;DMT/_0->C:2;"
+ "DMT/_1->C:3");
+}
+
+// Test set 2: BiasAddGrad + Conv2DBackpropFilter fusion tests
+
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Int32Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Conv2DBackpropFilter'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B', 'C'] }"
+ "node { name: 'E' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['C'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Int32Input);C(Input);"
+ "D(_MklConv2DBackpropFilterWithBias);DMT/_0(Const);DMT/_1(Const);"
+ "DMT/_2(Const)|A->D;A:control->DMT/_0:control;"
+ "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;C->D:2;"
+ "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// BiasAddGrad fusion in the presence of BackpropFilter, but the nodes do not
+// match the criteria for fusion, so the rewrite should not happen: the 3rd
+// input of Conv2DBackpropFilter is different from the input to BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative1) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Int32Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Conv2DBackpropFilter'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B', 'C'] }"
+ "node { name: 'E' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['A'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Int32Input);C(Input);"
+ "D(_MklConv2DBackpropFilter);DMT/_0(Const);DMT/_1(Const);"
+ "DMT/_2(Const);E(BiasAddGrad)|A->D;A->E;A:control->DMT/_0:control;"
+ "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;C->D:2;"
+ "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// BiasAddGrad fusion, but the nodes do not match the criteria for fusion:
+// their data_format attributes differ.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative2) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Int32Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Conv2DBackpropFilter'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B', 'C'] }"
+ "node { name: 'E' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NHWC' } }"
+ " input: ['A'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Int32Input);C(Input);"
+ "D(_MklConv2DBackpropFilter);DMT/_0(Const);DMT/_1(Const);"
+ "DMT/_2(Const);E(BiasAddGrad)|A->D;A->E;A:control->DMT/_0:control;"
+ "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;C->D:2;"
+ "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// BiasAddGrad in the presence of a Conv2DBackpropFilter that has already been
+// rewritten to _MklConv2DBackpropFilter. Fusion is done before node rewrite,
+// so no fusion should happen here; this test checks that ordering.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative3) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'M' op: '_MklInput'}"
+ "node { name: 'N' op: '_MklInput'}"
+ "node { name: 'O' op: '_MklInput'}"
+ "node { name: 'D' op: '_MklConv2DWithBias'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+ "node { name: 'E' op: 'Zeta'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['D', 'A']}"
+ "node { name: 'F' op: 'Int32Input'}"
+ "node { name: 'G' op: '_MklConv2DBackpropFilter'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['E', 'F', 'A', 'M', 'N', 'O'] }"
+ "node { name: 'H' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['E'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+ "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
+ "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;"
+ "C->D:2;D->E;E->G;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;"
+ "O->G:5");
+}
+
+// C=Conv2D(A,B); E=BiasAdd(C,D); Y=Zeta(E,X);
+// G=Conv2DBackpropInput(F,B,E)
+// This is a case of node rewrite followed by node merge followed by connecting
+// filter output of Conv2DWithBias to filter input of Conv2DBackpropInput.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_ConvBpropInput_FilterFwd) {
+ CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['C', 'D'] }"
+ "node { name: 'X' op: 'Input'}"
+ "node { name: 'Y' op: 'Zeta'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['E', 'X']}"
+ "node { name: 'F' op: 'Int32Input'}"
+ "node { name: 'G' op: 'Conv2DBackpropInput'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['F', 'B', 'E']}"
+ "node { name: 'Z' op: 'Zeta'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['G', 'X']}");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+ "DMT/_2(Const);DMT/_3(Const);E(_MklConv2DWithBias);F(Int32Input);"
+ "G(_MklConv2DBackpropInput);X(Input);Y(Zeta);Z(Zeta)|"
+ "A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+ "A:control->DMT/_2:control;B->E:1;D->E:2;DMT/_0->E:3;"
+ "DMT/_1->E:4;DMT/_2->E:5;DMT/_3->G:3;E->G:2;E->Y;E:1->G:1;E:2->G:5;"
+ "E:3->G:4;F->G;F:control->DMT/_3:control;G->Z;X->Y:1;X->Z:1");
+}
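+
+// In the expected string above, the "E:1->G:1" edge appears to be the filter
+// output of the merged _MklConv2DWithBias (E) being forwarded to the filter
+// input of _MklConv2DBackpropInput (G), i.e. the connection described in the
+// comment before this test.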
+
+/////////////////////////////////////////////////////////////////////
+// Unit tests related to rewriting node to Mkl node
+/////////////////////////////////////////////////////////////////////
+
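+// A note on reading the expected strings returned by
+// DoMklLayoutOptimizationPass(): the part before '|' lists the nodes as
+// "Name(Op)", and the part after '|' lists the edges. "X->Y" means output 0
+// of X feeds input 0 of Y, "X->Y:n" feeds input n of Y, and "X:m->Y:n"
+// connects output m of X to input n of Y; "A:control->B:control" denotes a
+// control edge. The "DMT/_k(Const)" nodes appear to be the dummy Mkl metadata
+// tensors inserted by the pass. For example, in
+// "A(Input);C(_MklConv2D)|A->C;DMT/_0->C:2", A feeds input 0 of C and the
+// dummy Mkl tensor DMT/_0 feeds input 2 of C. (This summary is inferred from
+// the expected strings used throughout the tests below.)
+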
+// Single Conv2D op; no Mkl layer on the input or on the output.
+// Dummy Mkl tensors will be generated as the additional Mkl inputs of Conv2D.
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['B', 'C'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(_MklConv2D);D(Zeta);DMT/_0(Const);"
+ "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
+ "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
+ "DMT/_1->C:3");
+}
+
+// 2 Conv2D ops in sequence. Both should get transformed, and the 1st Conv2D
+// will have 2 outputs, both of which will be inputs to the next Conv2D.
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'C']}"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(_MklConv2D);D(_MklConv2D);DMT/_0(Const);"
+ "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->C;A->D;"
+ "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+ "A:control->DMT/_2:control;B->C:1;C->D:1;C->E;"
+ "C:2->D:3;D->E:1;DMT/_0->C:2;DMT/_1->C:3;DMT/_2->D:2");
+}
+
+// Conv2D with an unsupported type (DT_HALF); Mkl should not rewrite it.
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
+ InitGraph(
+ "node { name: 'A' op: 'HalfInput'}"
+ "node { name: 'B' op: 'HalfInput'}"
+ "node { name: 'C' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_HALF } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_HALF } }"
+ " input: ['B', 'C'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(HalfInput);B(HalfInput);C(Conv2D);D(Zeta)|"
+ "A->C;B->C:1;B->D;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Int32Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Conv2DBackpropFilter'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B', 'C']}"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropFilter);"
+ "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
+ "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+ "A:control->DMT/_2:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+ "DMT/_1->D:4;DMT/_2->D:5");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Int32Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Conv2DBackpropInput'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['B', 'A', 'C']}"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropInput);"
+ "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
+ "A->D:1;A->E;B->D;B:control->DMT/_0:control;"
+ "B:control->DMT/_1:control;B:control->DMT/_2:control;C->D:2;"
+ "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// Check that we never rewrite BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Polygamma'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Zeta'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'A']}"
+ "node { name: 'E' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Polygamma);D(Zeta);E(BiasAddGrad)|"
+ "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+// Check that we never rewrite BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive1) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'MatMul'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'transpose_a' value { b: false } }"
+ " attr { key: 'transpose_b' value { b: false } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Zeta'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'A']}"
+ "node { name: 'E' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(MatMul);D(Zeta);E(BiasAddGrad)|"
+ "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+// Check that we never rewrite BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive2) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'M' op: '_MklInput'}"
+ "node { name: 'N' op: '_MklInput'}"
+ "node { name: 'C' op: '_MklConv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B', 'M', 'N']}"
+ "node { name: 'D' op: 'Zeta'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'A']}"
+ "node { name: 'E' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(_MklConv2D);D(Zeta);E(BiasAddGrad);"
+ "M(_MklInput);N(_MklInput)|A->C;A->D:1;B->C:1;C->D;D->E;"
+ "M->C:2;N->C:3");
+}
+
+// Concat Op test: Concat with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
+ InitGraph(
+ "node { name: 'A' op: 'Const' "
+ " attr { key: 'dtype' value { type: DT_INT32 } }"
+ " attr { key: 'value' value { "
+ " tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+ " int_val: 0 } } } }"
+ "node { name: 'B' op: 'InputList'"
+ " attr { key: 'N' value { i: 2 } }}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Concat'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'N' value { i: 2 } }"
+ " input: ['A', 'B:0', 'B:1']}"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
+ "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
+ "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
+ "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// Concat with 2 Mkl layers feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'F' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['C', 'D']}"
+ "node { name: 'G' op: 'Const' "
+ " attr { key: 'dtype' value { type: DT_INT32 } }"
+ " attr { key: 'value' value { "
+ " tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+ " int_val: 0 } } } }"
+ "node { name: 'H' op: 'Concat'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'N' value { i: 2 } }"
+ " input: ['G', 'E', 'F']}"
+ "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'H'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+ "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
+ "F(_MklConv2D);G(Const);H(_MklConcat);I(Zeta)|A->E;A->I;"
+ "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+ "B->E:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;"
+ "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
+ "DMT/_4->H:3;E->H:1;E:2->H:4;F->H:2;F:2->H:5;G->H;"
+ "G:control->DMT/_4:control;H->I:1");
+}
+
+// Concat with 1 Mkl and 1 non-Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'D']}"
+ "node { name: 'G' op: 'Const' "
+ " attr { key: 'dtype' value { type: DT_INT32 } }"
+ " attr { key: 'value' value { "
+ " tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+ " int_val: 0 } } } }"
+ "node { name: 'H' op: 'Concat'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'N' value { i: 2 } }"
+ " input: ['G', 'E', 'F']}"
+ "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'H'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+ "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);"
+ "H(_MklConcat);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;"
+ "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
+ "DMT/_1->E:3;DMT/_2->H:3;DMT/_3->H:5;E->H:1;E:2->H:4;F->H:2;"
+ "G->H;G:control->DMT/_2:control;G:control->DMT/_3:control;H->I:1");
+}
+
+// ConcatV2 Op test: ConcatV2 with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Basic) {
+ InitGraph(
+ "node { name: 'A' op: 'Const' "
+ " attr { key: 'dtype' value { type: DT_INT32 } }"
+ " attr { key: 'value' value { "
+ " tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+ " int_val: 0 } } } }"
+ "node { name: 'B' op: 'InputList'"
+ " attr { key: 'N' value { i: 2 } }}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'ConcatV2'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'Tidx' value { type: DT_INT32 } }"
+ " attr { key: 'N' value { i: 2 } }"
+ " input: ['B:0', 'B:1', 'A']}"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Const);B(InputList);C(Input);D(_MklConcatV2);DMT/_0(Const);"
+ "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D:2;B->D;B:1->D:1;"
+ "B:control->DMT/_0:control;B:control->DMT/_1:control;"
+ "B:control->DMT/_2:control;C->E;D->E:1;DMT/_0->D:3;"
+ "DMT/_1->D:4;DMT/_2->D:5");
+}
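+
+// Note that, as the Concat and ConcatV2 basic tests above suggest, Concat
+// takes the concat axis as its first input (A feeds D:0), while ConcatV2
+// takes it as its last input (A feeds D:2); the rewritten _MklConcat and
+// _MklConcatV2 nodes keep that ordering for the original inputs.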
+
+// ConcatV2 with 2 Mkl layers feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'F' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['C', 'D']}"
+ "node { name: 'G' op: 'Const' "
+ " attr { key: 'dtype' value { type: DT_INT32 } }"
+ " attr { key: 'value' value { "
+ " tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+ " int_val: 0 } } } }"
+ "node { name: 'H' op: 'ConcatV2'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'Tidx' value { type: DT_INT32 } }"
+ " attr { key: 'N' value { i: 2 } }"
+ " input: ['E', 'F', 'G']}"
+ "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'H'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+ "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
+ "F(_MklConv2D);G(Const);H(_MklConcatV2);I(Zeta)|A->E;A->I;"
+ "A:control->DMT/_2:control;A:control->DMT/_3:control;B->E:1;C->F;"
+ "C:control->DMT/_0:control;C:control->DMT/_1:control;"
+ "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
+ "DMT/_4->H:5;E->H;E:2->H:3;E:control->DMT/_4:control;F->H:1;"
+ "F:2->H:4;G->H:2;H->I:1");
+}
+
+// ConcatV2 with 1 Mkl and 1 non-Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'D']}"
+ "node { name: 'G' op: 'Const' "
+ " attr { key: 'dtype' value { type: DT_INT32 } }"
+ " attr { key: 'value' value { "
+ " tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+ " int_val: 0 } } } }"
+ "node { name: 'H' op: 'ConcatV2'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'Tidx' value { type: DT_INT32 } }"
+ " attr { key: 'N' value { i: 2 } }"
+ " input: ['E', 'F', 'G']}"
+ "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'H'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+ "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);"
+ "H(_MklConcatV2);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;"
+ "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
+ "DMT/_1->E:3;DMT/_2->H:4;DMT/_3->H:5;E->H;E:2->H:3;"
+ "E:control->DMT/_2:control;E:control->DMT/_3:control;F->H:1;"
+ "G->H:2;H->I:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Relu'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(_MklRelu);C(Zeta);DMT/_0(Const)|A->B;A->C;"
+ "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'ReluGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'C'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(_MklReluGrad);D(Zeta);DMT/_0(Const);"
+ "DMT/_1(Const)|A->C;A->D;A:control->DMT/_0:control;"
+ "A:control->DMT/_1:control;B->C:1;C->D:1;DMT/_0->C:2;DMT/_1->C:3");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluReluGrad_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Relu'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'ReluGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'C'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(_MklRelu);C(_MklReluGrad);D(Zeta);DMT/_0(Const);"
+ "DMT/_1(Const)|A->B;A->C;A->D;A:control->DMT/_0:control;"
+ "A:control->DMT/_1:control;B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;"
+ "DMT/_1->C:2");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'AvgPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(_MklAvgPool);C(Zeta);DMT/_0(Const)|A->B;A->C;"
+ "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolGrad_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Int32Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'AvgPoolGrad' "
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['A', 'B'] }"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['B', 'C'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Int32Input);B(Input);C(_MklAvgPoolGrad);D(Zeta);DMT/_0(Const);"
+ "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
+ "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
+ "DMT/_1->C:3");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolAvgPoolGrad_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'I' op: 'Int32Input'}"
+ "node { name: 'B' op: 'AvgPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'AvgPoolGrad' "
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['I', 'B'] }"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'C'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(_MklAvgPool);C(_MklAvgPoolGrad);D(Zeta);DMT/_0(Const);"
+ "DMT/_1(Const);I(Int32Input)|A->B;A->D;A:control->DMT/_0:control;"
+ "B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;DMT/_1->C:2;I->C;"
+ "I:control->DMT/_1:control");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormGrad_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'Input'}"
+ "node { name: 'F' op: 'FusedBatchNormGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'epsilon' value { f: 0.0001 } }"
+ " attr { key: 'is_training' value { b: true } }"
+ " input: ['A', 'B', 'C', 'D', 'E'] }"
+ "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'F'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+ "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
+ "F(_MklFusedBatchNormGrad);G(Zeta)|A->F;A->G;"
+ "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+ "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+ "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
+ "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;"
+ "E->F:4;F->G:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'Input'}"
+ "node { name: 'F' op: 'FusedBatchNorm'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'epsilon' value { f: 0.0001 } }"
+ " attr { key: 'is_training' value { b: true } }"
+ " input: ['A', 'B', 'C', 'D', 'E'] }"
+ "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'F'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+ "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
+ "F(_MklFusedBatchNorm);G(Zeta)|A->F;A->G;"
+ "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+ "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+ "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
+ "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;"
+ "E->F:4;F->G:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+// Unit tests related to rewriting node for workspace edges
+/////////////////////////////////////////////////////////////////////
+
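+// As the tests below suggest, pooling and LRN ops appear to pass an extra
+// workspace tensor (plus its Mkl metadata) to their corresponding gradient
+// ops when both the forward and the backward op are present; when only one
+// of them is present, the node is still rewritten on its own, with dummy
+// workspace tensors generated where needed (see
+// NodeWorkspace_MaxPool_Negative2).
+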
+/* Test LRN->MaxPool->MaxPoolGrad->LRNGrad replacement by workspace nodes. */
+TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'LRN'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'alpha' value { f: 0.001 } }"
+ " attr { key: 'beta' value { f: 0.75 } }"
+ " attr { key: 'bias' value { f: 1.0 } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'depth_radius' value { i: 2 } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'MaxPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['B'] }"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'MaxPoolGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['B', 'C', 'D'] }"
+ "node { name: 'F' op: 'Input'}"
+ "node { name: 'G' op: 'LRNGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'alpha' value { f: 0.001 } }"
+ " attr { key: 'beta' value { f: 0.75 } }"
+ " attr { key: 'bias' value { f: 1.0 } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'depth_radius' value { i: 2 } }"
+ " input: ['E', 'F', 'B'] }"
+ "node { name: 'H' op: 'Input'}"
+ "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['H', 'G'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);"
+ "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);"
+ "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;"
+ "B:2->C:1;B:2->E:4;B:2->G:6;B:3->G:7;B:control->DMT/_1:control;C->E:1;"
+ "C:1->E:3;C:2->E:5;C:3->E:7;D->E:2;DMT/_0->B:1;DMT/_1->E:6;DMT/_2->G:5;"
+ "E->G;E:1->G:4;E:control->DMT/_2:control;F->G:1;G->I:1;H->I");
+}
+
+/* Test LRN->LRNGrad replacement by workspace nodes. */
+TEST_F(MklLayoutPassTest, LRN_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'LRN'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'alpha' value { f: 0.001 } }"
+ " attr { key: 'beta' value { f: 0.75 } }"
+ " attr { key: 'bias' value { f: 1.0 } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'depth_radius' value { i: 2 } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'LRNGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'alpha' value { f: 0.001 } }"
+ " attr { key: 'beta' value { f: 0.75 } }"
+ " attr { key: 'bias' value { f: 1.0 } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'depth_radius' value { i: 2 } }"
+ " input: ['C', 'D', 'B'] }"
+ "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'E'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+ "DMT/_2(Const);E(_MklLRNGrad);F(Zeta)|"
+ "A->B;A:control->DMT/_0:control;B->E:2;B:1->E:3;B:2->E:6;B:3->E:7;"
+ "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
+ "D->E:1;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:5;E->F:1");
+}
+
+/* Test LRN->LRNGrad replacement when only the forward LRN is present. */
+TEST_F(MklLayoutPassTest, LRN_Negative1) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'LRN'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'alpha' value { f: 0.001 } }"
+ " attr { key: 'beta' value { f: 0.75 } }"
+ " attr { key: 'bias' value { f: 1.0 } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'depth_radius' value { i: 2 } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(_MklLRN);C(Zeta);DMT/_0(Const)|"
+ "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+/* Test LRN->LRNGrad replacement when only LRNGrad is present. */
+TEST_F(MklLayoutPassTest, LRN_Negative2) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'LRNGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'alpha' value { f: 0.001 } }"
+ " attr { key: 'beta' value { f: 0.75 } }"
+ " attr { key: 'bias' value { f: 1.0 } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'depth_radius' value { i: 2 } }"
+ " input: ['A', 'B', 'C'] }"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Input);D(_MklLRNGrad);DMT/_0(Const);"
+ "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Zeta)|"
+ "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+ "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+ "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+ "DMT/_1->D:7;DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
+}
+
+/* Test LRN->LRNGrad negative case, where single LRN feeds
+ 2 LRNGrad nodes at different slots. */
+TEST_F(MklLayoutPassTest, LRN_Negative3) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'LRN'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'alpha' value { f: 0.001 } }"
+ " attr { key: 'beta' value { f: 0.75 } }"
+ " attr { key: 'bias' value { f: 1.0 } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'depth_radius' value { i: 2 } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'LRNGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'alpha' value { f: 0.001 } }"
+ " attr { key: 'beta' value { f: 0.75 } }"
+ " attr { key: 'bias' value { f: 1.0 } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'depth_radius' value { i: 2 } }"
+ " input: ['C', 'D', 'B'] }"
+ "node { name: 'F' op: 'LRNGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'alpha' value { f: 0.001 } }"
+ " attr { key: 'beta' value { f: 0.75 } }"
+ " attr { key: 'bias' value { f: 1.0 } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'depth_radius' value { i: 2 } }"
+ " input: ['C', 'B', 'D'] }"
+ "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['E', 'F'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+ "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);DMT/_5(Const);"
+ "DMT/_6(Const);E(_MklLRNGrad);F(_MklLRNGrad);G(Zeta)|A->B;"
+ "A:control->DMT/_0:control;B->E:2;"
+ "B->F:1;B:1->E:3;B:2->E:6;B:2->F:5;B:3->E:7;C->E;C->F;"
+ "C:control->DMT/_1:control;C:control->DMT/_2:control;"
+ "C:control->DMT/_3:control;C:control->DMT/_4:control;"
+ "C:control->DMT/_5:control;C:control->DMT/_6:control;"
+ "D->E:1;D->F:2;DMT/_0->B:1;DMT/_1->F:3;DMT/_2->F:7;DMT/_3->F:4;"
+ "DMT/_4->F:6;DMT/_5->E:4;DMT/_6->E:5;E->G;F->G:1");
+}
+
+/* Test MaxPool->MaxPoolGrad replacement by workspace+rewrite nodes. */
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Positive) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'MaxPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'MaxPoolGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['C', 'B', 'D'] }"
+ "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'E'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(_MklMaxPool);C(Input);D(Input);DMT/_0(Const);"
+ "DMT/_1(Const);DMT/_2(Const);E(_MklMaxPoolGrad);F(Zeta)|"
+ "A->B;A:control->DMT/_0:control;B->E:1;B:1->E:3;B:2->E:5;B:3->E:7;"
+ "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
+ "D->E:2;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:6;E->F:1");
+}
+
+// Test MaxPool->MaxPoolGrad replacement when only MaxPool is present.
+// In this case, we will rewrite the MaxPool node, but workspace edges will
+// not be present.
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative1) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'MaxPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(_MklMaxPool);C(Zeta);DMT/_0(Const)|"
+ "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+// Test MaxPool->MaxPoolGrad replacement when only MaxPoolGrad is present.
+// In this case, we will rewrite MaxPoolGrad and generate dummy tensors for
+// the workspace tensor and its Mkl part.
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative2) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'MaxPoolGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:3, i:3} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:2, i:2} } }"
+ " input: ['A', 'B', 'C'] }"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'D'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Input);D(_MklMaxPoolGrad);DMT/_0(Const);"
+ "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Zeta)|"
+ "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+ "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+ "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+ "DMT/_1->D:7;DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
+}
+
+// Test MaxPool handling for batch-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative3) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'MaxPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 2, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative4) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'MaxPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 2, i:1, i:1, i:1} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative5) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'MaxPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:2, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative6) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'MaxPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:2, i:1, i:1} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative7) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'MaxPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NHWC' } }"
+ " attr { key: 'ksize' value { list: {i: 2, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative8) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'MaxPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NHWC' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 2, i:1, i:1, i:1} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative9) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'MaxPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NHWC' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:1, i:2} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative10) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'MaxPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NHWC' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:2} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }");
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+
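+// The tests below pass kGPUDevice as the second argument to InitGraph, which
+// presumably places every node on a GPU device; the expected strings show
+// that no Mkl rewrite or merge happens for such GPU-placed nodes.
+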
+// Single Conv2D Op on GPU device
+// No rewrite should happen
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Conv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B']}"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['B', 'C'] }", kGPUDevice);
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'M' op: '_MklInput'}"
+ "node { name: 'N' op: '_MklInput'}"
+ "node { name: 'O' op: '_MklInput'}"
+ "node { name: 'D' op: '_MklConv2DWithBias'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+ "node { name: 'E' op: 'Zeta'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['D', 'A']}"
+ "node { name: 'F' op: 'BiasAddGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['E'] }", kGPUDevice);
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+ "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);"
+ "O(_MklInput)|A->D;A->E:1;B->D:1;C->D:2;D->E;E->F;"
+ "M->D:3;N->D:4;O->D:5");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Int32Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Conv2DBackpropFilter'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B', 'C']}"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'D'] }", kGPUDevice);
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|"
+ "A->D;A->E;B->D:1;C->D:2;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Relu'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }", kGPUDevice);
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'ReluGrad'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }"
+ "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'C'] }", kGPUDevice);
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'MaxPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NHWC' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }", kGPUDevice);
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'AvgPool'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NHWC' } }"
+ " attr { key: 'ksize' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'VALID' } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " input: ['A'] }"
+ "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'B'] }", kGPUDevice);
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Concat Op test: Concat with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) {
+ InitGraph(
+ "node { name: 'A' op: 'Const' "
+ " attr { key: 'dtype' value { type: DT_INT32 } }"
+ " attr { key: 'value' value { "
+ " tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+ " int_val: 0 } } } }"
+ "node { name: 'B' op: 'InputList'"
+ " attr { key: 'N' value { i: 2 } }}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Concat'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'N' value { i: 2 } }"
+ " input: ['A', 'B:0', 'B:1']}"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'D'] }", kGPUDevice);
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;"
+ "B->D:1;B:1->D:2;C->E;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) {
+ InitGraph(
+ "node { name: 'A' op: 'Const' "
+ " attr { key: 'dtype' value { type: DT_INT32 } }"
+ " attr { key: 'value' value { "
+ " tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+ " int_val: 0 } } } }"
+ "node { name: 'B' op: 'InputList'"
+ " attr { key: 'N' value { i: 2 } }}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'ConcatV2'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'Tidx' value { type: DT_INT32 } }"
+ " attr { key: 'N' value { i: 2 } }"
+ " input: ['B:0', 'B:1', 'A']}"
+ "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['C', 'D'] }", kGPUDevice);
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|"
+ "A->D:2;B->D;B:1->D:1;C->E;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) {
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'C' op: 'Input'}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'Input'}"
+ "node { name: 'F' op: 'FusedBatchNorm'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'epsilon' value { f: 0.0001 } }"
+ " attr { key: 'is_training' value { b: true } }"
+ " input: ['A', 'B', 'C', 'D', 'E'] }"
+ "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+ " input: ['A', 'F'] }", kGPUDevice);
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(Input);D(Input);E(Input);"
+ "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;"
+ "E->F:4;F->G:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
+ CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+ InitGraph(
+ "node { name: 'A' op: 'Input'}"
+ "node { name: 'B' op: 'Input'}"
+ "node { name: 'M' op: '_MklInput'}"
+ "node { name: 'N' op: '_MklInput'}"
+ "node { name: 'C' op: '_MklConv2D'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+ " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
+ " attr { key: 'padding' value { s: 'SAME' } }"
+ " input: ['A', 'B', 'M', 'N']}"
+ "node { name: 'D' op: 'Input'}"
+ "node { name: 'E' op: 'BiasAdd'"
+ " attr { key: 'T' value { type: DT_FLOAT } }"
+ " attr { key: 'data_format' value { s: 'NCHW' } }"
+ " input: ['C', 'D'] }"
+ "node { name: 'Y' op: 'Input'}"
+ "node { name: 'Z' op: 'Zeta'"
+ " attr {key: 'T' value { type: DT_FLOAT } }"
+ " input: ['E', 'Y']}", kGPUDevice);
+ EXPECT_EQ(DoMklLayoutOptimizationPass(),
+ "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);"
+ "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;"
+ "B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+
+static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
+ testing::StopTiming();
+ string s;
+ for (int in = 0; in < 10; in++) {
+ s += strings::Printf("node { name: 'in%04d' op: 'Input'}", in);
+ }
+ random::PhiloxRandom philox(301, 17);
+ random::SimplePhilox rnd(&philox);
+ for (int op = 0; op < op_nodes; op++) {
+ s += strings::Printf(
+ "node { name: 'op%04d' op: 'Zeta' attr { key: 'T' value { "
+ "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }",
+ op, rnd.Uniform(10), rnd.Uniform(10));
+ }
+
+ bool first = true;
+ while (iters > 0) {
+ Graph* graph = new Graph(OpRegistry::Global());
+ InitGraph(s, graph);
+ int N = graph->num_node_ids();
+ if (first) {
+ testing::SetLabel(strings::StrCat("Per graph node. Nodes: ", N));
+ first = false;
+ }
+ {
+ testing::StartTiming();
+ std::unique_ptr<Graph> ug(graph);
+ RunMklLayoutRewritePass(&ug);
+ testing::StopTiming();
+ }
+ iters -= N; // Our benchmark units are individual graph nodes,
+ // not whole graphs
+ // delete graph;
+ }
+}
+BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
+
+} // namespace
+
+#endif // INTEL_MKL_DNN
+
} // namespace tensorflow
#endif /* INTEL_MKL */
diff --git a/tensorflow/core/kernels/logging_ops.cc b/tensorflow/core/kernels/logging_ops.cc
index 67d603d..bacf3e7 100644
--- a/tensorflow/core/kernels/logging_ops.cc
+++ b/tensorflow/core/kernels/logging_ops.cc
@@ -13,6 +13,7 @@
limitations under the License.
==============================================================================*/
+#include <iostream>
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/strings/str_util.h"
@@ -76,7 +77,7 @@
strings::StrAppend(&msg, "[", ctx->input(i).SummarizeValue(summarize_),
"]");
}
- LOG(INFO) << msg;
+ std::cerr << msg << std::endl;
}
private:
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 935eb81d..9aabbbd 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -19,7 +19,6 @@
#define EIGEN_USE_THREADS
#include <numeric>
-
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -29,10 +28,17 @@
#include "mkl_dnn_types.h"
#include "tensorflow/core/util/mkl_util.h"
-namespace tensorflow {
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+using mkldnn::stream;
+using mkldnn::sum;
+#endif
+namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
+#ifndef INTEL_MKL_DNN
+
template <typename Device, typename T>
class MklAddNOp : public OpKernel {
public:
@@ -41,17 +47,18 @@
void Compute(OpKernelContext* ctx) override {
const int num = ctx->num_inputs();
OP_REQUIRES(ctx, num / 2 == 2,
- errors::InvalidArgument("Only additions of two arguments "
+ errors::InvalidArgument("Only additions of two tensors "
"supported by MKL. Num inputs: ",
num));
MklAddNOpContext mkl_context;
- const Tensor& input0 = MklGetInput(ctx, 0);
- GetMklShape(ctx, 0, &(mkl_context.input1_shape));
+ size_t src1_idx = 0, src2_idx = 1;
+ const Tensor& input0 = MklGetInput(ctx, src1_idx);
+ GetMklShape(ctx, src1_idx, &(mkl_context.input1_shape));
bool input1_in_mkl_format = mkl_context.input1_shape.IsMklTensor();
- const Tensor& input1 = MklGetInput(ctx, 1);
- GetMklShape(ctx, 1, &(mkl_context.input2_shape));
+ const Tensor& input1 = MklGetInput(ctx, src2_idx);
+ GetMklShape(ctx, src2_idx, &(mkl_context.input2_shape));
bool input2_in_mkl_format = mkl_context.input2_shape.IsMklTensor();
// handle the case of a scalar
@@ -59,13 +66,12 @@
const TensorShape& o_shape = input0.shape();
Tensor* out_tensor = nullptr;
mkl_context.output_shape.SetMklTensor(false);
- AllocateOutputSetMklShape(ctx, 0, &out_tensor, o_shape,
+ AllocateOutputSetMklShape(ctx, src1_idx, &out_tensor, o_shape,
mkl_context.output_shape);
float user_i1 = (input0.scalar<T>()());
- ;
float user_i2 = (input1.scalar<T>()());
- ;
- out_tensor->scalar<T>()() = std::plus<float>{}(user_i1, user_i2);
+ out_tensor->scalar<T>()() =
+ std::plus<float>{}(user_i1, user_i2);
return;
}
@@ -82,8 +88,8 @@
if (o_shape.num_elements() == 0) {
Tensor* out_tensor = nullptr;
mkl_context.output_shape.SetMklTensor(false);
- AllocateOutputSetMklShape(ctx, 0, &out_tensor, o_shape,
- mkl_context.output_shape);
+ AllocateOutputSetMklShape(ctx, src1_idx, &out_tensor, o_shape,
+ mkl_context.output_shape);
return;
}
}
@@ -92,9 +98,9 @@
mkl_context.in_strides = new size_t[mkl_context.in_dims];
// Generate size, stride for input if input is in MKL format.
if (input1_in_mkl_format || input2_in_mkl_format) {
- const MklShape* tmp_mkl_shape = (input1_in_mkl_format)
- ? &mkl_context.input1_shape
- : &mkl_context.input2_shape;
+ const MklShape* tmp_mkl_shape =
+ (input1_in_mkl_format) ? &mkl_context.input1_shape :
+ &mkl_context.input2_shape;
for (int i = 0; i < mkl_context.in_dims; i++) {
mkl_context.in_sizes[i] = tmp_mkl_shape->GetSizes()[i];
mkl_context.in_strides[i] = tmp_mkl_shape->GetStrides()[i];
@@ -110,7 +116,6 @@
mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
}
}
-
std::vector<float> coeff(2, 1.0);
mkl_context.MklCreateInputLayouts(ctx);
CHECK_EQ(dnnSumCreate_F32(&mkl_context.Eltwise, mkl_context.attributes, 2,
@@ -127,7 +132,7 @@
mkl_context.output_shape.SetMklLayout(mkl_context.Eltwise, dnnResourceDst);
mkl_context.output_shape.SetTfLayout(
- mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
+ mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
if (input1_in_mkl_format == true) {
mkl_context.output_shape.SetTfDimOrder(mkl_context.in_dims,
mkl_context.input1_shape.GetTfToMklDimMap());
@@ -139,12 +144,12 @@
mkl_context.output_shape.GetMklLayout())) /
sizeof(T));
- AllocateOutputSetMklShape(ctx, 0, &output, tf_shape,
+ AllocateOutputSetMklShape(ctx, src1_idx, &output, tf_shape,
mkl_context.output_shape);
} else {
const TensorShape& o_shape = input1.shape();
mkl_context.output_shape.SetMklTensor(false);
- AllocateOutputSetMklShape(ctx, 0, &output, o_shape,
+ AllocateOutputSetMklShape(ctx, src1_idx, &output, o_shape,
mkl_context.output_shape);
}
@@ -172,16 +177,18 @@
void MklCreateInputLayouts(OpKernelContext* context) {
bool input1_in_mkl_format = input1_shape.IsMklTensor();
if (!input1_in_mkl_format) {
- CHECK_EQ(dnnLayoutCreate_F32(<_input1, in_dims, in_sizes, in_strides),
- E_SUCCESS);
+ CHECK_EQ(
+ dnnLayoutCreate_F32(<_input1, in_dims, in_sizes, in_strides),
+ E_SUCCESS);
} else {
lt_input1 = static_cast<dnnLayout_t>(input1_shape.GetCurLayout());
}
bool input2_in_mkl_format = input2_shape.IsMklTensor();
if (!input2_in_mkl_format) {
- CHECK_EQ(dnnLayoutCreate_F32(<_input2, in_dims, in_sizes, in_strides),
- E_SUCCESS);
+ CHECK_EQ(
+ dnnLayoutCreate_F32(<_input2, in_dims, in_sizes, in_strides),
+ E_SUCCESS);
} else {
lt_input2 = static_cast<dnnLayout_t>(input2_shape.GetCurLayout());
}
@@ -257,8 +264,8 @@
bool input2_in_mkl_format = input2_shape.IsMklTensor();
dnnDelete_F32(Eltwise);
if (!input1_in_mkl_format || !input2_in_mkl_format) {
- delete[] in_sizes;
- delete[] in_strides;
+ delete [] in_sizes;
+ delete [] in_strides;
}
if (!input1_in_mkl_format) {
dnnLayoutDelete_F32(lt_input1);
@@ -270,6 +277,151 @@
} MklAddNOpContext;
};
+#else // INTEL_MKL_DNN
+template <typename Device, typename T>
+class MklAddNOp : public OpKernel {
+ public:
+ ~MklAddNOp() {}
+ explicit MklAddNOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const int num = ctx->num_inputs();
+    // Only the addition of two input tensors is supported for now.
+ OP_REQUIRES(ctx, num / 2 == 2,
+ errors::InvalidArgument("Only additions of two tensors "
+ "supported by MKL. Num inputs: ",
+ num));
+
+ try {
+ auto cpu_engine = engine(engine::cpu, 0);
+ size_t src1_idx = 0, src2_idx = 1;
+ const Tensor& src1_tensor = MklGetInput(ctx, src1_idx);
+ const Tensor& src2_tensor = MklGetInput(ctx, src2_idx);
+
+ MklDnnShape src1_mkl_shape, src2_mkl_shape;
+ GetMklShape(ctx, src1_idx, &src1_mkl_shape);
+ GetMklShape(ctx, src2_idx, &src2_mkl_shape);
+ bool input1_in_mkl_format = src1_mkl_shape.IsMklTensor();
+ bool input2_in_mkl_format = src2_mkl_shape.IsMklTensor();
+      int src1_dims_size = input1_in_mkl_format ?
+          src1_mkl_shape.GetDimension() : src1_tensor.dims();
+      int src2_dims_size = input2_in_mkl_format ?
+          src2_mkl_shape.GetDimension() : src2_tensor.dims();
+
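+      // Special case: the first input is a TF-layout scalar. Add the two
+      // scalars directly and emit a TF-layout scalar output.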
+ if (!input1_in_mkl_format && src1_dims_size == 0) {
+ Tensor* dst_tensor = nullptr;
+ MklShape mkl_shape_dst;
+ mkl_shape_dst.SetMklTensor(false);
+ AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor,
+ src1_tensor.shape(), mkl_shape_dst);
+ float user_i1 = (src1_tensor.scalar<T>()());
+ float user_i2 = (src2_tensor.scalar<T>()());
+ dst_tensor->scalar<T>()() =
+ std::plus<float>{}(user_i1, user_i2);
+ return;
+ }
+
+ // If there is nothing to compute, return.
+ if (!input1_in_mkl_format && !input2_in_mkl_format) {
+ if (src1_tensor.shape().num_elements() == 0) {
+ Tensor* dst_tensor = nullptr;
+ MklShape mkl_shape_dst;
+ mkl_shape_dst.SetMklTensor(false);
+ AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor,
+ src1_tensor.shape(), mkl_shape_dst);
+ return;
+ }
+ }
+
+ // element-wise add operator for tensor input1 and tensor input2
+ std::vector<double> coeff(2, 1.0);
+ MklDnnData<T> src1(&cpu_engine);
+ MklDnnData<T> src2(&cpu_engine);
+ MklDnnData<T> dst(&cpu_engine);
+
+      int tmp_size = input1_in_mkl_format ? src2_dims_size : src1_dims_size;
+ memory::dims dims(tmp_size);
+ memory::dims strides(tmp_size);
+ memory::desc md1({}, memory::data_undef, memory::format_undef);
+ memory::desc md2({}, memory::data_undef, memory::format_undef);
+
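+      // Use one memory descriptor for both sources: reuse the MKL layout if
+      // either input carries one, otherwise build a plain blocked layout
+      // from the TF shape and strides.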
+      if (input1_in_mkl_format || input2_in_mkl_format) {
+        if (input1_in_mkl_format) {
+ md1 = src1_mkl_shape.GetMklLayout();
+ md2 = md1;
+ dst.SetUsrMem(md1);
+ } else {
+ md2 = src2_mkl_shape.GetMklLayout();
+ md1 = md2;
+ dst.SetUsrMem(md2);
+ }
+ } else {
+ dims = TFShapeToMklDnnDims(src1_tensor.shape());
+ strides = CalculateTFStrides(dims);
+ md1 = MklDnnData<T>::CreateBlockedMemDesc(dims, strides);
+ md2 = md1;
+ dst.SetUsrMem(dims, strides);
+ }
+
+ std::vector<memory::primitive_desc> srcs_pd;
+
+ src1.SetUsrMem(md1, &src1_tensor);
+ auto mpd1 = src1.GetUsrMemPrimDesc();
+ srcs_pd.push_back(mpd1);
+
+ src2.SetUsrMem(md2, &src2_tensor);
+ auto mpd2 = src2.GetUsrMemPrimDesc();
+ srcs_pd.push_back(mpd2);
+
+ std::vector<primitive::at> inputs;
+ inputs.push_back(src1.GetOpMem());
+ inputs.push_back(src2.GetOpMem());
+ auto output_pd = dst.GetUsrMemPrimDesc();
+ Tensor* dst_tensor = nullptr;
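+      // Create an MKL-DNN sum primitive that adds the two sources (with unit
+      // coefficients) into the destination memory.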
+ auto sum_pd = sum::primitive_desc(dst.GetUsrMemDesc(), coeff, srcs_pd);
+ auto sum_op = sum(sum_pd, inputs, dst.GetOpMem());
+      if (input2_in_mkl_format || input1_in_mkl_format) {
+ MklDnnShape output_mkl_shape;
+ output_mkl_shape.SetMklTensor(true);
+ output_mkl_shape.SetMklLayout(&output_pd);
+ output_mkl_shape.SetElemType(MklDnnType<T>());
+        if (input1_in_mkl_format) {
+ output_mkl_shape.SetTfLayout(src1_dims_size,
+ src1_mkl_shape.GetSizesAsMklDnnDims(),
+ src1_mkl_shape.GetTfDataFormat());
+ } else {
+ output_mkl_shape.SetTfLayout(src2_dims_size,
+ src2_mkl_shape.GetSizesAsMklDnnDims(),
+ src2_mkl_shape.GetTfDataFormat());
+ }
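+      // The TF side of an MKL-layout output is a flat buffer sized to hold
+      // the MKL layout, rounded up to a whole number of elements of T.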
+ TensorShape output_tf_shape;
+ output_tf_shape.AddDim((output_pd.get_size() / sizeof(T))
+ + (output_pd.get_size()%sizeof(T) == 0 ? 0 : 1));
+ AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor, output_tf_shape,
+ output_mkl_shape);
+ } else {
+ MklShape mkl_shape_dst;
+ mkl_shape_dst.SetMklTensor(false);
+ AllocateOutputSetMklShape(ctx, src1_idx,
+ &dst_tensor, src1_tensor.shape(), mkl_shape_dst);
+ }
+
+ dst.SetUsrMemDataHandle(dst_tensor);
+ std::vector<primitive> net;
+ net.push_back(sum_op);
+ stream(stream::kind::eager).submit(net).wait();
+ } catch (mkldnn::error &e) {
+ string error_msg = "Status: " + std::to_string(e.status) +
+ ", message: " + string(e.message) +
+ ", in file " + string(__FILE__) + ":" +
+ std::to_string(__LINE__);
+ OP_REQUIRES_OK(ctx, errors::Aborted("Operation received an exception:",
+ error_msg));
+ }
+ }
+};
+
+#endif
#define REGISTER_MKL_CPU(T) \
REGISTER_KERNEL_BUILDER(Name("_MklAddN") \
.Device(DEVICE_CPU) \
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc
index d90baee..d751a70 100644
--- a/tensorflow/core/kernels/mkl_avgpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -24,10 +24,25 @@
#include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+using mkldnn::memory;
+using mkldnn::error;
+using mkldnn::pooling_forward;
+using mkldnn::pooling_backward;
+using mkldnn::padding_kind;
+using mkldnn::engine;
+using mkldnn::prop_kind;
+using mkldnn::algorithm;
+#endif
+
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
+// For now, MKL-ML is the default, so the MKL-DNN path is not enabled by default.
+#ifndef INTEL_MKL_DNN
+
template <typename Device, typename T>
class MklAvgPoolingOp : public OpKernel {
public:
@@ -132,7 +147,7 @@
E_SUCCESS);
mkl_context.MklCleanup();
- }
+ } // Compute
private:
typedef struct {
@@ -411,7 +426,293 @@
std::vector<int32> stride_;
Padding padding_;
TensorFormat data_format_;
-};
+}; // MklAvgPoolingGradOp
+
+
+#else // INTEL_MKL_DNN is defined
+
+template <typename Device, typename T>
+class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
+ public:
+ explicit MklAvgPoolingOp(OpKernelConstruction* context)
+ : MklPoolingForwardOpBase<T>(context) {
+ // Workspace is an MKLDNN construct that is only used in Max Pooling.
+ // So set workspace_enabled_ to false.
+ this->workspace_enabled_ = false;
+ }
+
+ void Compute(OpKernelContext* context) override {
+ try {
+ auto cpu_engine = engine(engine::cpu, 0);
+ const Tensor& input_tensor = MklGetInput(context,
+ this->kInputTensorIndexInput);
+ MklDnnShape dnn_shape_input;
+ GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input);
+ this->SanityCheckInput(context, input_tensor, dnn_shape_input);
+ if (!context->status().ok()) return;
+
+ MklDnnData<T> dnn_data_input(&cpu_engine);
+ MklDnnData<T> dnn_data_output(&cpu_engine);
+
+ // initialize variables for the pooling op
+ MklPoolParameters pool_params;
+ // Get the input tensor and initialize the pooling parameters
+ this->ConfigureInput(context, dnn_shape_input,
+ input_tensor, &pool_params,
+ &dnn_data_input);
+ OP_REQUIRES_OK(context, context->status());
+
+ // Declare output tensor
+ Tensor* output_tensor = nullptr;
+ memory::dims output_dims_mkl_order;
+ this->GetOutputDims(pool_params, &output_dims_mkl_order);
+
+      // If the input is in MKL layout, take the memory format directly from
+      // it instead of deriving it from AvgPool's data_format attribute.
+ if (dnn_shape_input.IsMklTensor()) {
+ dnn_data_output.SetUsrMem(output_dims_mkl_order,
+ static_cast<memory::format>(dnn_data_input.GetUsrMemDesc()
+ .data.format));
+
+ } else {
+ dnn_data_output.SetUsrMem(output_dims_mkl_order,
+ this->data_format_mkldnn_);
+ }
+
+ // describe the memory layout
+ dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
+
+      // Create the forward average-pooling primitive descriptor.
+ auto pool_desc = pooling_forward::desc(prop_kind::forward,
+ algorithm::pooling_avg_exclude_padding,
+ dnn_data_input.GetUsrMemDesc(),
+ dnn_data_output.GetUsrMemDesc(),
+ memory::dims({ pool_params.row_stride,
+ pool_params.col_stride}),
+ memory::dims({ pool_params.window_rows,
+ pool_params.window_cols}),
+ memory::dims({ static_cast<int>(pool_params.pad_top),
+ static_cast<int>(pool_params.pad_left)}),
+ memory::dims({ static_cast<int>(pool_params.pad_bottom),
+ static_cast<int>(pool_params.pad_right)}),
+ TFPaddingToMklDnnPadding(this->padding_));
+ auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc,
+ cpu_engine);
+
+ this->AllocateOutputTensor(context, pool_prim_desc, output_dims_mkl_order,
+ this->data_format_mkldnn_, &output_tensor);
+ CHECK_NOTNULL(output_tensor);
+
+ OP_REQUIRES_OK(context, context->status());
+ dnn_data_output.SetUsrMemDataHandle(output_tensor);
+
+ this->PrepareAndExecuteNet(pool_prim_desc,
+ &dnn_data_input,
+ &dnn_data_output);
+ } catch (mkldnn::error &e) {
+ string error_msg = "Status: " + std::to_string(e.status) +
+ ", message: " + string(e.message) +
+ ", in file " + string(__FILE__) + ":" +
+ std::to_string(__LINE__);
+ OP_REQUIRES_OK(context,
+ errors::Aborted("Operation received an exception:",
+ error_msg));
+ }
+ } // Compute
+}; // MklAvgPoolingOp
+
+//-----------------------------------------------------------------------------
+
+template <class Device, class T>
+class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
+ public:
+ explicit MklAvgPoolingGradOp(OpKernelConstruction* context)
+ : MklPoolingBackwardOpBase<T>(context) {
+ }
+
+ void Compute(OpKernelContext* context) override {
+ try {
+ auto cpu_engine = engine(engine::cpu, 0);
+ MklDnnShape original_input_mkl_shape, input_gradient_mkl_shape;
+ const Tensor& tensor_in_shape = MklGetInput(context,
+ kInputTensorIndexInputShape);
+ const Tensor& input_gradient_tensor = MklGetInput(context,
+ kInputTensorIndexInputGradient);
+ GetMklShape(context, kInputTensorIndexInputShape,
+ &original_input_mkl_shape);
+ GetMklShape(context, kInputTensorIndexInputGradient,
+ &input_gradient_mkl_shape);
+
+ SanityCheckInputs(context, tensor_in_shape,
+ input_gradient_tensor,
+ original_input_mkl_shape,
+ input_gradient_mkl_shape);
+ if (!context->status().ok()) return;
+
+      // Used to allocate output_diff_src/diff_src
+      // and to create the pool_fwd memory desc.
+      // 0. Input("orig_input_shape: int32")  // NOT a T Tensor!
+      // 1. Input("grad: T")
+
+ MklDnnData<T> input_gradient_diff_dst(&cpu_engine);
+ MklDnnData<T> output_diff_src(&cpu_engine);
+ Tensor* output_tensor_diff_src = nullptr;
+ TensorShape original_input_shape;
+ MklPoolParameters pool_params;
+ memory::dims output_dims_mkl_order, original_input_dims_nchw;
+ // Configure the original input memory descriptor
+ memory::desc original_input_md = ConfigureOriginalInput(context,
+ tensor_in_shape,
+ original_input_mkl_shape,
+ &original_input_dims_nchw,
+ &pool_params,
+ &original_input_shape);
+
+ // configure the original output memory descriptor
+ // by definition, the shape of the original output is the same
+ // as the shape of the gradient diff_dst
+ memory::desc original_output_md = this->ConfigureOriginalOutput(
+ pool_params, input_gradient_mkl_shape, output_dims_mkl_order);
+
+ memory::desc target_diff_dst_md = this->ConfigureInputGradient(
+ input_gradient_mkl_shape,
+ input_gradient_tensor,
+ &input_gradient_diff_dst,
+ original_output_md);
+      // The output diff_src needs the same shape as the original input, but
+      // we set its format to match the format of the input gradient rather
+      // than that of the original input, since the original input is always
+      // in Tensorflow layout (AvgPoolGrad gets the shape of the input rather
+      // than the actual input).
+ output_diff_src.SetUsrMem(original_input_dims_nchw,
+ static_cast<memory::format>(
+ target_diff_dst_md.data.format));
+
+ // Create the forward pooling primitive descriptor so we can reference it
+ // in the backward pooling primitive descriptor
+ auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward,
+ algorithm::pooling_avg_exclude_padding,
+ original_input_md,
+ original_output_md,
+ memory::dims({ pool_params.row_stride,
+ pool_params.col_stride}),
+ memory::dims({ pool_params.window_rows,
+ pool_params.window_cols}),
+ memory::dims({ static_cast<int>(pool_params.pad_top),
+ static_cast<int>(pool_params.pad_left)}),
+ memory::dims({ static_cast<int>(pool_params.pad_bottom),
+ static_cast<int>(pool_params.pad_right)}),
+ TFPaddingToMklDnnPadding(this->padding_));
+ auto pool_fwd_prim_desc
+ = pooling_forward::primitive_desc(pool_fwd_desc,
+ cpu_engine);
+
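+      // Backward pooling descriptor: diff_src takes the shape of the
+      // original input, diff_dst the layout of the incoming gradient.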
+ auto pool_bkwd_desc = pooling_backward::desc(
+ algorithm::pooling_avg_exclude_padding,
+ output_diff_src.GetUsrMemDesc(),
+ target_diff_dst_md,
+ memory::dims({ pool_params.row_stride,
+ pool_params.col_stride}),
+ memory::dims({ pool_params.window_rows,
+ pool_params.window_cols}),
+ memory::dims({ static_cast<int>(pool_params.pad_top),
+ static_cast<int>(pool_params.pad_left)}),
+ memory::dims({ static_cast<int>(pool_params.pad_bottom),
+ static_cast<int>(pool_params.pad_right)}),
+ TFPaddingToMklDnnPadding(this->padding_));
+ auto pool_bkwd_prim_desc
+ = pooling_backward::primitive_desc(pool_bkwd_desc,
+ cpu_engine,
+ pool_fwd_prim_desc);
+ this->AllocateOutputTensor(context, pool_bkwd_prim_desc,
+ original_input_dims_nchw,
+ this->data_format_mkldnn_,
+ &output_tensor_diff_src);
+
+ output_diff_src.SetUsrMemDataHandle(output_tensor_diff_src);
+
+ this->PrepareAndExecuteNet(pool_bkwd_prim_desc,
+ &input_gradient_diff_dst,
+ &output_diff_src,
+ memory::primitive_desc(
+ target_diff_dst_md,
+ cpu_engine));
+ } catch (mkldnn::error &e) {
+ string error_msg = "Status: " + std::to_string(e.status) +
+ ", message: " + string(e.message) +
+ ", in file " + string(__FILE__) + ":" +
+ std::to_string(__LINE__);
+ OP_REQUIRES_OK(context,
+ errors::Aborted("Compute received an exception:",
+ error_msg));
+ }
+ } // Compute
+
+ private:
+ // 0. Input("orig_input_shape: int32")
+ // 1. Input("grad: T")
+ const int kInputTensorIndexInputShape = 0;
+ const int kInputTensorIndexInputGradient = 1;
+
+ memory::desc ConfigureOriginalInput(OpKernelContext* context,
+ const Tensor& tensor_original_input_shape,
+ const MklDnnShape& original_input_mkl_shape,
+ memory::dims* original_input_dims_mkl_order,
+ MklPoolParameters* pool_params,
+ TensorShape* input_tensor_shape) {
+ CHECK_NOTNULL(original_input_dims_mkl_order);
+ CHECK_NOTNULL(pool_params);
+ CHECK_NOTNULL(input_tensor_shape);
+    // For AvgPoolGrad, we only get the size of the original input because
+    // the original data is irrelevant.
+ auto shape_vec = tensor_original_input_shape.vec<int32>();
+ for (int64 i = 0; i < tensor_original_input_shape.NumElements(); ++i) {
+ input_tensor_shape->AddDim(shape_vec(i));
+ }
+
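+    // Delegate the remaining configuration to the common backward-pooling
+    // base class.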
+ return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
+ context,
+ tensor_original_input_shape,
+ original_input_mkl_shape,
+ original_input_dims_mkl_order,
+ pool_params,
+ *input_tensor_shape);
+  }
+
+ void SanityCheckInputs(OpKernelContext* context,
+ const Tensor& tensor_in_shape,
+ const Tensor& input_gradient_tensor,
+ const MklDnnShape& original_input_mkl_shape,
+ const MklDnnShape& input_gradient_mkl_shape) {
+ if (!original_input_mkl_shape.IsMklTensor()) {
+ OP_REQUIRES(context, tensor_in_shape.dims() == 1 &&
+ tensor_in_shape.NumElements() == 4,
+ errors::InvalidArgument("original input shape must be "
+ "1-dimensional and 4 elements"));
+ } else {
+ OP_REQUIRES(context, original_input_mkl_shape.GetDimension() == 1 &&
+ original_input_mkl_shape.DimSize(0) == 4,
+ errors::InvalidArgument("original input shape must be "
+ "1-dimensional and 4 elements"));
+ }
+
+ if (!input_gradient_mkl_shape.IsMklTensor()) {
+ // For avgpooling, input_gradient_diff_dst should have 4 dimensions.
+ OP_REQUIRES(context, input_gradient_tensor.dims() == 4,
+ errors::InvalidArgument("Gradient shape must be "
+ "4-dimensional"));
+ } else {
+ OP_REQUIRES(context, input_gradient_mkl_shape.GetDimension() == 4,
+ errors::InvalidArgument("Gradient shape must be "
+ "4-dimensional"));
+ }
+ }
+}; // MklAvgPoolingGradOp
+
+#endif // INTEL_MKL_DNN
REGISTER_KERNEL_BUILDER(Name("_MklAvgPool")
.Device(DEVICE_CPU)
@@ -427,3 +728,4 @@
} // namespace tensorflow
#endif // INTEL_MKL
+
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index e6673b2..d0175df 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -1,11 +1,8 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
-
http://www.apache.org/licenses/LICENSE-2.0
-
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -33,11 +30,22 @@
#include "mkl_dnn_types.h"
#include "tensorflow/core/util/mkl_util.h"
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+
+using mkldnn::stream;
+using mkldnn::concat;
+#endif
+
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
+// List of TensorShape objects. Used in Concat/Split layers.
+typedef std::vector<TensorShape> TensorShapeList;
+
enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
+
// TODO(intelft) Check if we can reuse existing EigenConcatOp using Mutable
// reference inputs.
// --------------------------------------------------------------------------
@@ -55,6 +63,8 @@
// we need to have empty Compute because Compute is pure virtual function.
void Compute(OpKernelContext* c) {}
+#ifndef INTEL_MKL_DNN
+
void Compute(OpKernelContext* c, const std::vector<Tensor>& values) {
const Tensor* concat_dim_tensor;
const char* axis_attribute_name =
@@ -139,8 +149,89 @@
ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
}
}
+
+#else  // INTEL_MKL_DNN
+
+void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
+ const TensorShapeList& input_shapes) {
+ const Tensor* concat_dim_tensor;
+ const char* axis_attribute_name =
+ AxisArgName == NAME_IS_AXIS
+ ? "axis"
+ : AxisArgName == NAME_IS_CONCAT_DIM ? "concat_dim" : "<invalid>";
+ OP_REQUIRES_OK(c, c->input(axis_attribute_name, &concat_dim_tensor));
+ OP_REQUIRES(c, IsLegacyScalar(concat_dim_tensor->shape()),
+ errors::InvalidArgument(
+ axis_attribute_name,
+ " tensor should be a scalar integer, but got shape ",
+ concat_dim_tensor->shape().DebugString()));
+ const int32 concat_dim =
+ internal::SubtleMustCopy(concat_dim_tensor->scalar<int32>()());
+  // Instead of accessing the values from the context, we use the inputs
+  // passed to Compute.
+ const int N = values.size();
+ const int input_dims = input_shapes[0].dims();
+ const TensorShape& input_shape = input_shapes[0];
+
+ int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
+ OP_REQUIRES(c,
+ (0 <= axis && axis < input_dims) ||
+ (allow_legacy_scalars() && concat_dim == 0),
+ errors::InvalidArgument(
+ "ConcatOp : Expected concatenating dimensions in the range "
+ "[",
+ -input_dims, ", ", input_dims, "), but got ", concat_dim));
+ // Note that we reduce the concat of n-dimensional tensors into a two
+ // dimensional concat. Assuming the dimensions of any input/output
+ // tensor are {x0, x1,...,xn-1, y0, y1,...,ym-1}, where the concat is along
+ // the dimension indicated with size y0, we flatten it to {x, y}, where y =
+ // Prod_i(yi) and x = ((n > 0) ? Prod_i(xi) : 1).
+ ConstMatrixVector inputs_flat;
+ inputs_flat.reserve(N);
+ int64 inputs_flat_dim0 = 1;
+ for (int d = 0; d < axis; ++d) {
+ inputs_flat_dim0 *= input_shape.dim_size(d);
+ }
+ int64 output_concat_dim = 0;
+ const bool input_is_scalar = IsLegacyScalar(input_shape);
+ for (int i = 0; i < N; ++i) {
+ const auto in = values[i];
+ const bool in_is_scalar = IsLegacyScalar(input_shapes[i]);
+ OP_REQUIRES(
+ c, (input_shapes[i].dims() == input_dims) ||
+ (input_is_scalar && in_is_scalar),
+ errors::InvalidArgument(
+ "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
+ input_shape.DebugString(), " vs. shape[", i,
+ "] = ", input_shapes[i].DebugString()));
+ if (in.NumElements() > 0) {
+ int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
+ inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+ in.shaped<T, 2>({inputs_flat_dim0, inputs_flat_dim1})));
+ }
+ output_concat_dim += input_shapes[i].dims() > 0 ?
+ input_shapes[i].dim_size(axis) : 1;
+ }
+
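+  // The output keeps the input shape except along the concat axis, which
+  // takes the accumulated size.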
+ TensorShape output_shape(input_shape);
+ if (output_shape.dims() == 0) {
+ output_shape.AddDim(output_concat_dim);
+ } else {
+ output_shape.set_dim(axis, output_concat_dim);
+ }
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
+ if (output->NumElements() > 0) {
+ int64 output_dim1 = output->NumElements() / inputs_flat_dim0;
+ auto output_flat = output->shaped<T, 2>({inputs_flat_dim0, output_dim1});
+ ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
+ }
+ }
+
+#endif
};
+#ifndef INTEL_MKL_DNN
+
// --------------------------------------------------------------------------
// Mkl Concat Op
// --------------------------------------------------------------------------
@@ -327,6 +418,7 @@
OP_REQUIRES_OK(context, context->status());
}
+
private:
typedef struct {
TensorFormat data_format;
@@ -435,8 +527,284 @@
mkl_tensor->flat<uint8>().data(),
mkl_tensor->flat<uint8>().size() * sizeof(uint8));
}
+
+  // Overloaded method that takes the input shapes as a list of TensorShapes.
+ void CallEigenVersion(OpKernelContext* context, const OpInputList& values,
+ const TensorShapeList& input_shapes) {
+ CHECK_EQ(values.size(), input_shapes.size());
+
+ std::vector<Tensor> converted_values;
+ for (int i = 0; i < input_shapes.size(); i++) {
+ converted_values.push_back(values[i]);
+ }
+
+ // Call Eigen concat.
+ eigen_concat_op_.Compute(context, converted_values);
+
+ // Set dummy Mkl tensor as output Mkl tensor for this op.
+ MklShape mkl_tensor_mkl_shape;
+ mkl_tensor_mkl_shape.SetMklTensor(false);
+ mkl_tensor_mkl_shape.SetDimensions(4);
+ Tensor* mkl_tensor = nullptr;
+ TensorShape mkl_tensor_tf_shape;
+ mkl_tensor_tf_shape.AddDim(
+ SIZE_OF_MKL_SERIAL_DATA(mkl_tensor_mkl_shape.GetDimension()));
+ int tf_output_index = 0;
+ context->allocate_output(
+ GetTensorMetaDataIndex(tf_output_index, context->num_outputs()),
+ mkl_tensor_tf_shape, &mkl_tensor);
+ mkl_tensor_mkl_shape.SerializeMklShape(
+ mkl_tensor->flat<uint8>().data(),
+ mkl_tensor->flat<uint8>().size() * sizeof(uint8));
+ }
};
+#else
+
+// --------------------------------------------------------------------------
+// Mkl Concat Op
+// --------------------------------------------------------------------------
+
+template <typename Device, typename T, AxisArgumentName AxisArgName>
+class MklConcatOp : public OpKernel {
+ private:
+ TensorFormat data_format_;
+ EigenConcatBaseOp<Device, T, AxisArgName> eigen_concat_op_;
+
+ public:
+ typedef std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>
+ ConstMatrixVector;
+
+ explicit MklConcatOp(OpKernelConstruction* c)
+ : OpKernel(c), eigen_concat_op_(c) {}
+
+ void Compute(OpKernelContext* context) override {
+ try {
+ auto cpu_engine = engine(engine::cpu, 0);
+ OpInputList input_tensors;
+ GetMklInputList(context, "values", &input_tensors);
+ const int N = input_tensors.size();
+
+ // Get Tensor shapes.
+ std::vector<MklDnnShape> input_shapes(N);
+ GetMklShapeList(context, "values", &input_shapes);
+
+ const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM)
+ ? MklGetInput(context, 0) : MklGetInput(context, N);
+ // Sanity checks
+ OP_REQUIRES(context, IsLegacyScalar(concat_dim_tensor.shape()),
+ errors::InvalidArgument(
+ "Concat dim tensor should be a scalar integer, but got shape ",
+ concat_dim_tensor.shape().DebugString()));
+ int32 concat_dim = internal::SubtleMustCopy(
+ concat_dim_tensor.scalar<int32>()());
+ if (concat_dim < 0) concat_dim = N + concat_dim;
+
+ // check that ranks of all tensors match
+ // and that their shapes match except for concat_dim.
+ int i = 0;
+ bool invoke_eigen = false;
+ bool are_all_mkl_inputs = true, are_all_tf_inputs = true;
+ const TensorShape expected_shape = input_shapes[0].IsMklTensor() ?
+ input_shapes[0].GetTfShape() :
+ input_tensors[0].shape();
+ size_t expected_dims = expected_shape.dims();
+ for (auto& s : input_shapes) {
+ if (s == expected_shape) {++i; continue;}
+
+ TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() :
+ input_tensors[i].shape();
+ size_t s_dims = s_shape.dims();
+
+ OP_REQUIRES(context, s_dims == expected_dims,
+ errors::InvalidArgument(
+ "_MklConcatOp : Ranks of all input tensors should match:"
+ " input dimensions = ",
+ s_dims, " vs. expected rank = ", expected_dims));
+
+ for (int d = 0; d < expected_dims; ++d) {
+ if (d == concat_dim) continue;
+
+ size_t expected_size = expected_shape.dim_size(d);
+ size_t s_size = s_shape.dim_size(d);
+ OP_REQUIRES(
+ context, expected_size == s_size,
+ errors::InvalidArgument("_MklConcatOp : Dimensions of inputs "
+ "should match: shape[0][", d, "]= ", expected_size,
+ " vs. shape[", i, "][", d, "] = ", s_size));
+ }
+
+ if (s.IsMklTensor())
+ are_all_tf_inputs = false;
+ else
+ are_all_mkl_inputs = false;
+
+ if (s_dims != 4) invoke_eigen = true;
+ ++i;
+ }
+
+      // Not all inputs are in the same format (TF or MKL); this is the
+      // mixed-input case. We could potentially handle it by converting the
+      // TF inputs to MKL format and avoiding the Eigen version, but for now
+      // we fall back to Eigen.
+ if (!are_all_tf_inputs && !are_all_mkl_inputs) invoke_eigen = true;
+
+ // Temporary fallback to Eigen until MKLDNN Concat performance
+ // is improved. To be removed.
+ invoke_eigen = true;
+
+ // Call Eigen library
+ if (invoke_eigen) {
+ TensorShapeList tf_input_shapes;
+ i = 0;
+ for (auto& s : input_shapes) {
+ TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() :
+ input_tensors[i].shape();
+ tf_input_shapes.push_back(s_shape);
+ ++i;
+ }
+ CallEigenVersion(context, input_tensors, tf_input_shapes);
+ return;
+ }
+
+ memory::dims dst_dims;
+ if (are_all_mkl_inputs)
+ dst_dims = TFShapeToMklDnnDims(input_shapes[0].GetTfShape());
+ else
+        // When all the inputs are in Tensorflow format, we don't know what
+        // the input data format is; in that case we just use an output
+        // format that is the same as the input format.
+ dst_dims = TFShapeToMklDnnDims(input_tensors[0].shape());
+
+ std::vector<memory::primitive_desc> srcs_pd;
+ std::vector<MklDnnData<T>> srcs(N, MklDnnData<T>(&cpu_engine));
+ int64 dst_concat_dim_size = 0;
+      for (int k = 0; k < N; k++) {
+ bool is_mkl_tensor = input_shapes[k].IsMklTensor();
+ memory::dims src_dims;
+
+        // The same consideration as for dst_dims applies to src_dims.
+ src_dims = (is_mkl_tensor) ?
+ TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) :
+ TFShapeToMklDnnDims(input_tensors[k].shape());
+
+ dst_concat_dim_size += src_dims[concat_dim];
+        // It does not matter what data format we use here (NHWC or NCHW);
+        // we just need to ensure that the output of Concat uses the same
+        // data format as the input.
+        auto src_md = is_mkl_tensor ? input_shapes[k].GetMklLayout() :
+            memory::desc(src_dims, MklDnnType<T>(), memory::format::nhwc);
+
+ srcs[k].SetUsrMem(src_md, &input_tensors[k]);
+ auto src_mpd = srcs[k].GetUsrMemPrimDesc();
+ srcs_pd.push_back(src_mpd);
+ }
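+      // The output extent along concat_dim is the sum of the inputs' extents
+      // along that dimension.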
+ dst_dims[concat_dim] = dst_concat_dim_size;
+
+ MklDnnData<T> dst(&cpu_engine);
+ memory::desc dst_md({}, memory::data_undef, memory::format_undef);
+ memory::dims dst_dims_in_nchw;
+ if (are_all_mkl_inputs) {
+ // Since we are passing a specific format for destination,
+ // we need to have dst_dims in MklDnn order (NCHW).
+ auto orig_tf_format = input_shapes[0].GetTfDataFormat();
+ dst_dims_in_nchw = MklDnnDimsInNCHW(dst_dims,
+ MklDnnDataFormatToTFDataFormat(orig_tf_format));
+        // We set the output in the same format as the input to avoid layout
+        // conversions. Currently the dst format is the same as the input
+        // format; see if this choice can be made in a better way.
+ dst_md = memory::desc(dst_dims_in_nchw, MklDnnType<T>(),
+ (memory::format) input_shapes[0].GetMklLayout().data.format);
+ } else {
+        // Again, the format does not matter here; we just need to make it
+        // the same as the input format.
+ dst_md = memory::desc(dst_dims, MklDnnType<T>(), memory::format::nhwc);
+ }
+
+ std::vector<primitive::at> inputs;
+      for (int k = 0; k < input_tensors.size(); k++)
+ inputs.push_back(srcs[k].GetOpMem());
+
+      // If all inputs are in MKL format, the meaning of concat_dim needs to
+      // change. The value of concat_dim is tied to the input Tensorflow data
+      // format (NHWC or NCHW), while MklDnn dimensions are in NCHW order. So
+      // if the Tensorflow tensors are in NCHW order, the concat_dim semantics
+      // are preserved, but if the input tensors are in NHWC order, the
+      // semantics need to change. E.g., if we are concatenating over Channel
+      // (dimension 3 for NHWC), then since the MklDnn order is NCHW,
+      // concat_dim needs to be 1.
+ if (are_all_mkl_inputs)
+ concat_dim = input_shapes[0].TfDimIdx(concat_dim);
+
+ auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd);
+
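+      // If all inputs carried MKL layouts, emit an MKL-layout output (a flat
+      // TF buffer plus MKL shape metadata); otherwise emit a plain TF tensor.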
+ MklDnnShape dnn_shape_dst;
+ TensorShape tf_shape_dst;
+ Tensor* dst_tensor = nullptr;
+ if (are_all_mkl_inputs) {
+ dnn_shape_dst.SetMklTensor(true);
+ auto dst_pd = concat_pd.dst_primitive_desc();
+ dnn_shape_dst.SetMklLayout(&dst_pd);
+ dnn_shape_dst.SetElemType(MklDnnType<T>());
+ dnn_shape_dst.SetTfLayout(dst_dims.size(), dst_dims_in_nchw,
+ input_shapes[0].GetTfDataFormat());
+ tf_shape_dst.AddDim((dst_pd.get_size() / sizeof(T)));
+ } else {
+ dnn_shape_dst.SetMklTensor(false);
+ tf_shape_dst = MklDnnDimsToTFShape(dst_dims);
+ }
+ AllocateOutputSetMklShape(context, 0, &dst_tensor,
+ tf_shape_dst, dnn_shape_dst);
+ CHECK_NOTNULL(dst_tensor);
+
+ dst_md = dnn_shape_dst.IsMklTensor() ?
+ dnn_shape_dst.GetMklLayout() : dst_md;
+ dst.SetUsrMem(dst_md, dst_tensor);
+
+ auto concat_op = concat(concat_pd, inputs, dst.GetOpMem());
+ std::vector<primitive> net;
+ net.push_back(concat_op);
+ stream(stream::kind::eager).submit(net).wait();
+ } catch (mkldnn::error &e) {
+ string error_msg = "Status: " + std::to_string(e.status) +
+ ", message: " + string(e.message) + ", in file " +
+ string(__FILE__) + ":" + std::to_string(__LINE__);
+ OP_REQUIRES_OK(context, errors::Aborted(
+ "Operation received an exception:", error_msg));
+ }
+ }
+
+ void CallEigenVersion(OpKernelContext* context, const OpInputList& values,
+ const TensorShapeList& input_shapes) {
+ CHECK_EQ(values.size(), input_shapes.size());
+
+ std::vector<Tensor> converted_values;
+ for (int i = 0; i < input_shapes.size(); i++)
+ converted_values.push_back(values[i]);
+
+ // Call Eigen concat.
+ eigen_concat_op_.Compute(context, converted_values, input_shapes);
+
+ // Set output Mkl tensor for this op.
+ MklDnnShape dnn_shape_output;
+ dnn_shape_output.SetMklTensor(false);
+ dnn_shape_output.SetDimensions(4);
+ Tensor* output_tensor = nullptr;
+ TensorShape tf_shape_output;
+ tf_shape_output.AddDim(
+ dnn_shape_output.GetSerializeBufferSize());
+ context->allocate_output(
+ GetTensorMetaDataIndex(0, context->num_outputs()),
+ tf_shape_output, &output_tensor);
+ dnn_shape_output.SerializeMklDnnShape(
+ output_tensor->flat<uint8>().data(),
+ output_tensor->flat<uint8>().size() * sizeof(uint8));
+ }
+};
+
+#endif
+
/* Use optimized concat for float type only */
#define REGISTER_MKL_CPU(type) \
REGISTER_KERNEL_BUILDER(Name("_MklConcat") \
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index f291281..793fa24 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -47,11 +47,8 @@
using mkldnn::stream;
using mkldnn::prop_kind;
-
-using mkldnn::convolution_forward;
using mkldnn::convolution_backward_weights;
-using mkldnn::convolution_direct;
-
+using mkldnn::memory;
#endif
namespace tensorflow {
@@ -426,183 +423,229 @@
TensorFormat data_format_;
};
+#define REGISTER_MKL_FILTER_KERNELS(T) \
+ REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .Label(mkl_op_registry::kMklOpLabel), \
+ MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
+#undef REGISTER_MKL_FILTER_KERNELS
+
#else
-template <typename Device, class T>
-class MklConv2DCustomBackpropFilterOp : public OpKernel {
+template <typename Device, class T, bool biasEnabled>
+class MklConv2DCustomBackpropFilterOp :
+ public MklConv2DBackpropCommonOp<Device, T> {
public:
explicit MklConv2DCustomBackpropFilterOp(OpKernelConstruction* context)
- : OpKernel(context) {
- string data_format;
- OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
- OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
- errors::InvalidArgument("Invalid data format"));
+ : MklConv2DBackpropCommonOp<Device, T>(context) { }
+ ~MklConv2DCustomBackpropFilterOp() {}
- OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
- int stride_n = GetTensorDim(strides_, data_format_, 'N');
- int stride_c = GetTensorDim(strides_, data_format_, 'C');
- OP_REQUIRES(
- context, (stride_n == 1 && stride_c == 1),
- errors::InvalidArgument("Current implementation does not yet support "
- "strides in the batch and depth dimensions."));
- OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ private:
+ void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
+ const MklDnnShape& filter_mkl_shape,
+ const MklDnnShape& obp_mkl_shape) {
+ CHECK(!filter_mkl_shape.IsMklTensor())
+ << "Conv2DBackpropFilter: filter should not be in MKL Layout";
}
- void Compute(OpKernelContext* context) override {
- try {
- auto cpu_engine = engine(engine::cpu, 0);
+ size_t GetInputTensorIndexWithSizes() { return 1; /* filter index */ }
- MklDnnData<T> input(&cpu_engine);
- MklDnnData<T> outbackprop(&cpu_engine);
- MklDnnData<T> output(&cpu_engine);
+ TensorShape MakeInputTfShape(OpKernelContext* context,
+ const Tensor& input_tensor) {
+ size_t input_idx = 0;
+ return GetTfShape(context, input_idx);
+ }
- // Input tensors
- const Tensor& input_tensor = MklGetInput(context, 0);
- const Tensor& filter_tensor = MklGetInput(context, 1);
- const Tensor& obp_tensor = MklGetInput(context, 2); // Outbackprop
+ TensorShape MakeFilterTfShape(OpKernelContext* context,
+ const Tensor& filter_tensor) {
+ TensorShape filter_tf_shape;
+ CHECK_EQ(TensorShapeUtils::IsVector(filter_tensor.shape()), true);
+ CHECK_EQ(TensorShapeUtils::MakeShape(
+ filter_tensor.vec<int32>(), &filter_tf_shape).ok(), true);
+ return filter_tf_shape;
+ }
- // Generate input shapes.
- TensorShape filter_shape;
- OP_REQUIRES(context, TensorShapeUtils::IsVector(filter_tensor.shape()),
- errors::InvalidArgument(
- "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
- filter_tensor.dims()));
- OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
- filter_tensor.vec<int32>(), &filter_shape));
- TensorShape input_shape = input_tensor.shape();
- TensorShape obp_shape = obp_tensor.shape();
+ const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
+ const memory::dims& fwd_filter_dims) {
+ // Shape of output of Conv2DBackpropFilter is same as shape of filter.
+ return fwd_filter_dims;
+ }
- // By default, all dims are in MKL order. Only dims in TF order
- // are those with prefix tf_order.
- memory::dims obp_dims, fwd_input_dims, fwd_filter_dims;
- memory::dims padding_l, padding_r, strides, fwd_output_dims;
- memory::dims fwd_output_dims_tf_order;
+ memory::format GetOutputFormat(const memory::format data_format) {
+ // Output layout is Tensorflow's filter layout (HWIO).
+ return memory::format::hwio;
+ }
- // Get forward convolution parameters.
- MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
- conv_utl.GetConvFwdSizesInMklOrder(input_shape, filter_shape,
- &fwd_input_dims, &fwd_filter_dims,
- &strides,
- &fwd_output_dims_tf_order,
- &fwd_output_dims,
- &padding_l, &padding_r);
- if (!context->status().ok()) return;
+ void CreatePrimitive(OpKernelContext* context,
+ const engine& cpu_engine,
+ const convolution_forward::primitive_desc& conv_fwd_pd,
+ MklDnnData<T>* input, MklDnnData<T>* filter,
+ MklDnnData<T>* outbackprop, MklDnnData<T>* output,
+ Tensor** output_tensor,
+ const memory::dims& strides,
+ const memory::dims& padding_l,
+ const memory::dims& padding_r,
+ padding_kind padding,
+ const memory::dims& bwd_output_dims,
+ memory::format bwd_output_format) {
+ CHECK_NOTNULL(context);
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(filter);
+ CHECK_NOTNULL(outbackprop);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(output_tensor);
- // Create Convolution forward descriptor since Convolution backward
- // API needs it. For that, we first need to create input, filter
- // and output memory descriptors.
- auto mkl_data_format = TFDataFormatToMklDnnDataFormat(data_format_);
- auto fwd_src_md = memory::desc(fwd_input_dims, MklDnnType<T>(),
- mkl_data_format);
- auto fwd_filter_md = memory::desc(fwd_filter_dims, MklDnnType<T>(),
- memory::format::hwio);
- auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType<T>(),
- mkl_data_format);
- auto fwd_desc = convolution_forward::desc(prop_kind::forward,
- convolution_direct, fwd_src_md, fwd_filter_md, fwd_out_md,
- strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
- auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
+ MklDnnData<T>* bias_grad = nullptr;
+ int depth = 0;
+ if (biasEnabled) {
+ // Data structure for bias_grad
+ bias_grad = new MklDnnData<T> (&cpu_engine);
+ TensorShape obp_tf_shape = GetTfShape(context, 2);
+ depth = (MklConv2DBackpropCommonOp<Device, T>::GetTFDataFormat()
+ == FORMAT_NCHW) ?
+ obp_tf_shape.dim_size(1) : obp_tf_shape.dim_size(3);
+ memory::dims bias_grad_dims = {depth};
+ bias_grad->SetOpMemDesc(bias_grad_dims, memory::format::x);
+ }
- // Allocate output tensor and shape
- // TODO(nhasabni): Update this when support for MKL layout is added.
- // Shape of output of Conv2DBackpropInput is same as 'input' of Conv2D.
- TensorShape tf_output_shape(filter_shape);
- MklShape mkl_output_mkl_shape;
- mkl_output_mkl_shape.SetMklTensor(false);
- Tensor* output_tensor = nullptr;
- AllocateOutputSetMklShape(context, 0, &output_tensor, tf_output_shape,
- mkl_output_mkl_shape);
+ // Create convolution backward weights primitive.
+ auto bwd_desc = (biasEnabled && (bias_grad != nullptr))?
+ convolution_backward_weights::desc(convolution_direct,
+ input->GetOpMemDesc(), output->GetOpMemDesc(),
+ bias_grad->GetOpMemDesc(),
+ outbackprop->GetOpMemDesc(), strides, padding_l,
+ padding_r, padding) :
+ convolution_backward_weights::desc(convolution_direct,
+ input->GetOpMemDesc(), output->GetOpMemDesc(),
+ outbackprop->GetOpMemDesc(), strides, padding_l,
+ padding_r, padding);
- // Create memory for user data.
- // Describe how the inputs and outputs of Convolution look like. Also
- // specify buffers containing actual input and output data.
- // Although input shape required is in MKL-DNN order, the layout is
- // Tensorflow's layout (NHWC or NCHW depending on data format).
- input.SetUsrMem(fwd_input_dims, mkl_data_format, &input_tensor);
- // Outbackprop shape is NHWC or NCHW depending on data format. Since
- // GetInputSizeInMklOrder function returns size in that order we just use
- // use that function directly.
- conv_utl.GetInputSizeInMklOrder(obp_shape, &obp_dims);
- if (!context->status().ok()) return;
- outbackprop.SetUsrMem(obp_dims, mkl_data_format, &obp_tensor);
- // Although output shape required is in MKL-DNN order,
- // layout is Tensorflow's filter layout (HWIO)
- // Shape of output of Conv2DBackpropInput is same as shape of filter.
- memory::dims bwd_output_dims = fwd_filter_dims;
- output.SetUsrMem(bwd_output_dims, memory::format::hwio, output_tensor);
+ auto bwd_pd = convolution_backward_weights::primitive_desc(bwd_desc,
+ cpu_engine,
+ conv_fwd_pd);
- // Create memory descriptors for convolution data w/ no specified format.
- input.SetOpMemDesc(fwd_input_dims, memory::format::any);
- outbackprop.SetOpMemDesc(obp_dims, memory::format::any);
- output.SetOpMemDesc(bwd_output_dims, memory::format::any);
+ // Allocate output tensor.
+ AllocateOutputTensor(context, bwd_pd, bwd_output_dims,
+ bwd_output_format, output_tensor);
- // Create convolution backward weights primitive.
- auto bwd_desc = convolution_backward_weights::desc(convolution_direct,
- input.GetOpMemDesc(), output.GetOpMemDesc(),
- outbackprop.GetOpMemDesc(), strides, padding_l,
- padding_r, TFPaddingToMklDnnPadding(padding_));
+ CHECK_NOTNULL(*output_tensor);
+ // Set buffer handle using allocated output tensor.
+ output->SetUsrMemDataHandle(*output_tensor);
- auto bwd_pd = convolution_backward_weights::primitive_desc(bwd_desc,
- cpu_engine,
- fwd_pd);
+ if (biasEnabled && (bias_grad != nullptr)) {
+ // Allocate bias_grad tensor
+ TensorShape bias_grad_shape({depth});
+ Tensor* bias_grad_tensor = nullptr;
+ AllocateBiasGradTensor(context, bias_grad_shape, &bias_grad_tensor);
+ memory::dims bias_grad_dims = {depth};
+ // Since Bias is 1D, we use format::x from MKLDNN to represent it.
+ auto bias_grad_md = memory::desc({bias_grad_dims}, MklDnnType<T>(),
+ memory::format::x);
+ bias_grad->SetUsrMem(bias_grad_md, bias_grad_tensor);
+ bias_grad->SetUsrMemDataHandle(bias_grad_tensor);
+ }
- PrepareAndExecutePrimitive(bwd_pd, &input, &outbackprop, &output);
- } catch (mkldnn::error &e) {
- string error_msg = "Status: " + std::to_string(e.status) +
- ", message: " + string(e.message) +
- ", in file " + string(__FILE__) + ":" +
- std::to_string(__LINE__);
- OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:",
- error_msg));
+ if (biasEnabled && (bias_grad != nullptr)) {
+ PrepareAndExecutePrimitive(bwd_pd, input, outbackprop, output, bias_grad);
+ } else {
+ PrepareAndExecutePrimitive(bwd_pd, input, outbackprop, output);
}
}
- private:
- std::vector<int32> strides_;
- Padding padding_;
- TensorFormat data_format_;
+ // Allocate output tensor.
+ void AllocateOutputTensor(OpKernelContext* context,
+ const convolution_backward_weights::primitive_desc& conv_pd,
+ const memory::dims& output_dims_mkl_order,
+ memory::format output_tf_format, Tensor** output_tensor) {
+ CHECK_NOTNULL(output_tensor);
+
+    // For BackpropFilter, we convert the output tensor back to Tensorflow
+    // layout, because BackpropFilter is typically the last operator in the
+    // graph that emits the filter gradient, which is then provided to the
+    // ApplyGradient method to update the filter. It may be possible to
+    // eliminate this conversion by forwarding the filter in MKL layout if we
+    // support the ApplyGradient method in MKL layout propagation.
+ MklDnnShape output_mkl_shape;
+ output_mkl_shape.SetMklTensor(false);
+ // output_dims_mkl_order is in OIHW format.
+ // Allocate shape of TF tensor in HWIO format.
+ TensorShape output_tf_shape({output_dims_mkl_order[MklDnnDims::Dim_H],
+ output_dims_mkl_order[MklDnnDims::Dim_W],
+ output_dims_mkl_order[MklDnnDims::Dim_I],
+ output_dims_mkl_order[MklDnnDims::Dim_O]});
+ AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
+ output_mkl_shape);
+ }
+
+ // Allocate tensor for bias grad
+ void AllocateBiasGradTensor(OpKernelContext* context,
+ const TensorShape& bias_grad_shape,
+ Tensor** bias_grad_tensor) {
+ CHECK_NOTNULL(bias_grad_tensor);
+
+ MklDnnShape bias_grad_mkl_shape;
+ bias_grad_mkl_shape.SetMklTensor(false);
+ AllocateOutputSetMklShape(context, 1, bias_grad_tensor, bias_grad_shape,
+ bias_grad_mkl_shape);
+ }
// Prepare and execute net - checks for input and output reorders.
void PrepareAndExecutePrimitive(
const convolution_backward_weights::primitive_desc& conv_pd,
MklDnnData<T>* input, MklDnnData<T>* obp,
- MklDnnData<T>* output) {
+ MklDnnData<T>* output, MklDnnData<T>* bias_grad = nullptr) {
// Create reorders between user layout and MKL layout if it is needed and
// add it to the net before convolution.
std::vector<primitive> net;
input->CheckReorderToOpMem(conv_pd.src_primitive_desc(), &net);
obp->CheckReorderToOpMem(conv_pd.diff_dst_primitive_desc(), &net);
- // Memory for output of convolution. Since we may need reorder on the
- // output side, we will prepare reorder primitive in case output
- // reorder to user memory is required.
+    // For BackpropFilter, we convert the output tensor back to Tensorflow
+    // layout.
bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
conv_pd.diff_weights_primitive_desc());
- net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(),
- obp->GetOpMem(), output->GetOpMem()));
+ if (biasEnabled && (bias_grad != nullptr)) {
+ net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(),
+ obp->GetOpMem(), output->GetOpMem(),
+ bias_grad->GetOpMem()));
+ } else {
+ net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(),
+ obp->GetOpMem(), output->GetOpMem()));
+ }
- // Insert reorder primitive in the net for output reorder if reorder is
- // required.
if (output_reorder_required) {
output->InsertReorderToUserMem(&net);
}
- // Handle output reorder
stream(stream::kind::eager).submit(net).wait();
}
};
-#endif
#define REGISTER_MKL_FILTER_KERNELS(T) \
REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.Label(mkl_op_registry::kMklOpLabel), \
- MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
+ MklConv2DCustomBackpropFilterOp<CPUDevice, T, false>);\
+ REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilterWithBias") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .Label(mkl_op_registry::kMklOpLabel), \
+ MklConv2DCustomBackpropFilterOp<CPUDevice, T, true>); \
+ REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .Label(mkl_op_registry::kMklOpLabel), \
+ MklDummyOp<CPUDevice, T>);
TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
#undef REGISTER_MKL_FILTER_KERNELS
+
+#endif // INTEL_MKL_DNN
+
} // namespace tensorflow
#endif // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index 4a47d04..df51df9 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -49,9 +49,6 @@
using mkldnn::stream;
using mkldnn::prop_kind;
-
-using mkldnn::convolution_forward;
-using mkldnn::convolution_direct;
using mkldnn::convolution_backward_data;
#endif
@@ -362,143 +359,117 @@
#else
template <typename Device, class T>
-class MklConv2DCustomBackpropInputOp : public OpKernel {
+class MklConv2DCustomBackpropInputOp :
+ public MklConv2DBackpropCommonOp<Device, T> {
public:
- ~MklConv2DCustomBackpropInputOp() {}
explicit MklConv2DCustomBackpropInputOp(OpKernelConstruction* context)
- : OpKernel(context) {
- string data_format_str;
- OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
- OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
- errors::InvalidArgument("Invalid data format"));
- OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
- int stride_n = GetTensorDim(strides_, data_format_, 'N');
- int stride_c = GetTensorDim(strides_, data_format_, 'C');
- OP_REQUIRES(
- context, (stride_n == 1 && stride_c == 1),
- errors::InvalidArgument("Current implementation does not yet support "
- "strides in the batch and depth dimensions."));
-
- OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
- }
-
- void Compute(OpKernelContext* context) override {
- try {
- auto cpu_engine = engine(engine::cpu, 0);
-
- MklDnnData<T> filter(&cpu_engine);
- MklDnnData<T> outbackprop(&cpu_engine);
- MklDnnData<T> output(&cpu_engine);
-
- // Input tensors
- const Tensor& input_tensor = MklGetInput(context, 0);
- const Tensor& filter_tensor = MklGetInput(context, 1);
- const Tensor& obp_tensor = MklGetInput(context, 2); // Outbackprop
-
- // Generate input shape.
- TensorShape input_shape;
- OP_REQUIRES(context, TensorShapeUtils::IsVector(input_tensor.shape()),
- errors::InvalidArgument(
- "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
- input_tensor.dims()));
- OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
- input_tensor.vec<int32>(), &input_shape));
- TensorShape filter_shape = filter_tensor.shape();
- TensorShape obp_shape = obp_tensor.shape();
-
- // By default, all dims are in MKL order. Only dims in TF order
- // are those with prefix tf_order.
- memory::dims obp_dims, fwd_input_dims, fwd_filter_dims;
- memory::dims padding_l, padding_r, strides, fwd_output_dims;
- memory::dims fwd_output_dims_tf_order;
-
- // Get forward convolution parameters.
- MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
- conv_utl.GetConvFwdSizesInMklOrder(input_shape, filter_shape,
- &fwd_input_dims, &fwd_filter_dims,
- &strides,
- &fwd_output_dims_tf_order,
- &fwd_output_dims,
- &padding_l, &padding_r);
- if (!context->status().ok()) return;
-
- // Create Convolution forward descriptor since Convolution backward
- // API needs it. For that, we first need to create input, filter
- // and output memory descriptors.
- auto mkl_data_format = TFDataFormatToMklDnnDataFormat(data_format_);
- auto fwd_src_md = memory::desc(fwd_input_dims, MklDnnType<T>(),
- mkl_data_format);
- auto fwd_filter_md = memory::desc(fwd_filter_dims, MklDnnType<T>(),
- memory::format::hwio);
- auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType<T>(),
- mkl_data_format);
- auto fwd_desc = convolution_forward::desc(prop_kind::forward,
- convolution_direct, fwd_src_md, fwd_filter_md, fwd_out_md,
- strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
- auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
-
- // Allocate output tensor and shape
- // TODO(nhasabni): Update this when support for MKL layout is added.
- // Shape of output of Conv2DBackpropInput is same as 'input' of Conv2D.
- TensorShape tf_output_shape(input_shape);
- MklShape mkl_output_mkl_shape;
- mkl_output_mkl_shape.SetMklTensor(false);
- Tensor* output_tensor = nullptr;
- AllocateOutputSetMklShape(context, 0, &output_tensor, tf_output_shape,
- mkl_output_mkl_shape);
-
- // Create memory for user data.
- // Describe how the inputs and outputs of Convolution look like. Also
- // specify buffers containing actual input and output data.
- // Although input shape required is in MKL-DNN order, the layout is
- // Tensorflow's layout (NHWC or NCHW depending on data format).
- // Although filter shape (filter_dims) required is in MKL-DNN order,
- // the layout is Tensorflow's layout (HWIO).
- // Shape of Conv2DBackpropInput's filter is same as that of Conv2D filter.
- filter.SetUsrMem(fwd_filter_dims, memory::format::hwio, &filter_tensor);
- // Outbackprop shape is NHWC or NCHW depending on data format. Since
- // GetInputSizeInMklOrder function returns size in that order we just use
- // use that function directly.
- conv_utl.GetInputSizeInMklOrder(obp_shape, &obp_dims);
- if (!context->status().ok()) return;
- outbackprop.SetUsrMem(obp_dims, mkl_data_format, &obp_tensor);
- // Although output shape required is in MKL-DNN order,
- // layout is Tensorflow's layout (NHWC or NCHW depending on data format).
- // Shape of output of Conv2DBackpropInput is same as shape of 'input'
- // of Conv2D.
- memory::dims bwd_output_dims = fwd_input_dims;
- output.SetUsrMem(bwd_output_dims, mkl_data_format, output_tensor);
-
- // Create memory descriptors for convolution data w/ no specified format.
- filter.SetOpMemDesc(fwd_filter_dims, memory::format::any);
- outbackprop.SetOpMemDesc(obp_dims, memory::format::any);
- output.SetOpMemDesc(bwd_output_dims, memory::format::any);
-
- // Create convolution backward data primitive.
- auto bwd_desc = convolution_backward_data::desc(convolution_direct,
- output.GetOpMemDesc(), filter.GetOpMemDesc(),
- outbackprop.GetOpMemDesc(), strides, padding_l,
- padding_r, TFPaddingToMklDnnPadding(padding_));
-
- auto bwd_pd = convolution_backward_data::primitive_desc(bwd_desc,
- cpu_engine,
- fwd_pd);
-
- PrepareAndExecutePrimitive(bwd_pd, &filter, &outbackprop, &output);
- } catch (mkldnn::error &e) {
- string error_msg = "Status: " + std::to_string(e.status) +
- ", message: " + string(e.message) +
- ", in file " + string(__FILE__) + ":" +
- std::to_string(__LINE__);
- OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:",
- error_msg));
- }
- }
+ : MklConv2DBackpropCommonOp<Device, T>(context) { }
+ ~MklConv2DCustomBackpropInputOp() {}
private:
- std::vector<int32> strides_;
- Padding padding_;
- TensorFormat data_format_;
+ void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
+ const MklDnnShape& filter_mkl_shape,
+ const MklDnnShape& obp_mkl_shape) {
+    // The tensor that feeds the 'Input' slot of BackpropInput is always just
+    // the shape of the tensor, never an actual tensor, so it will never be
+    // in MKL layout.
+ CHECK(!input_mkl_shape.IsMklTensor())
+ << "Conv2DBackpropInput: input should not be in MKL Layout";
+ }
+
+ size_t GetInputTensorIndexWithSizes() { return 0; /* input index */ }
+
+ TensorShape MakeInputTfShape(OpKernelContext* context,
+ const Tensor& input_tensor) {
+ TensorShape input_tf_shape;
+ CHECK_EQ(TensorShapeUtils::IsVector(input_tensor.shape()), true);
+ CHECK_EQ(TensorShapeUtils::MakeShape(input_tensor.vec<int32>(),
+ &input_tf_shape).ok(), true);
+ return input_tf_shape;
+ }
+
+ TensorShape MakeFilterTfShape(OpKernelContext* context,
+ const Tensor& filter_tensor) {
+ size_t filter_idx = 1;
+ return GetTfShape(context, filter_idx);
+ }
+
+ const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
+ const memory::dims& fwd_filter_dims) {
+    // Output shape of Conv2DBackpropInput is the same as Conv2D's 'input' shape.
+ return fwd_input_dims;
+ }
+
+ memory::format GetOutputFormat(const memory::format data_format) {
+ // Output layout is Tensorflow's layout in data format order.
+ return data_format;
+ }
+
+ void CreatePrimitive(OpKernelContext* context,
+ const engine& cpu_engine,
+ const convolution_forward::primitive_desc& conv_fwd_pd,
+ MklDnnData<T>* input, MklDnnData<T>* filter,
+ MklDnnData<T>* outbackprop, MklDnnData<T>* output,
+ Tensor** output_tensor,
+ const memory::dims& strides,
+ const memory::dims& padding_l,
+ const memory::dims& padding_r,
+ padding_kind padding,
+ const memory::dims& bwd_output_dims,
+ memory::format bwd_output_format) {
+ CHECK_NOTNULL(context);
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(filter);
+ CHECK_NOTNULL(outbackprop);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(output_tensor);
+
+ // Create convolution backward data primitive.
+ auto bwd_desc = convolution_backward_data::desc(convolution_direct,
+ output->GetOpMemDesc(), filter->GetOpMemDesc(),
+ outbackprop->GetOpMemDesc(), strides, padding_l,
+ padding_r, padding);
+
+ auto bwd_pd = convolution_backward_data::primitive_desc(bwd_desc,
+ cpu_engine,
+ conv_fwd_pd);
+
+
+ // Allocate output tensor in TensorFlow and MKL layout.
+ AllocateOutputTensor(context, bwd_pd, bwd_output_dims,
+ bwd_output_format, output_tensor);
+ CHECK_NOTNULL(*output_tensor);
+ // Set buffer handle using allocated output tensor.
+ output->SetUsrMemDataHandle(*output_tensor);
+
+ PrepareAndExecutePrimitive(bwd_pd, filter, outbackprop, output);
+ }
+
+ // Allocate output tensor.
+ void AllocateOutputTensor(OpKernelContext* context,
+ const convolution_backward_data::primitive_desc& conv_pd,
+ const memory::dims& output_dims_mkl_order,
+ memory::format output_tf_format, Tensor** output_tensor) {
+ CHECK_NOTNULL(output_tensor);
+
+ // Output primitive descriptor for backward data is diff_src.
+ auto dst_pd = conv_pd.diff_src_primitive_desc();
+
+ // Allocate shape of Mkl tensor.
+ MklDnnShape output_mkl_shape;
+ output_mkl_shape.SetMklTensor(true);
+ output_mkl_shape.SetMklLayout(&dst_pd);
+ output_mkl_shape.SetElemType(MklDnnType<T>());
+ output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+ output_dims_mkl_order, output_tf_format);
+
+ // Allocate shape of TF tensor.
+ TensorShape output_tf_shape;
+ output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T));
+
+ AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
+ output_mkl_shape);
+ }
// Prepare and execute net - checks for input and output reorders.
void PrepareAndExecutePrimitive(
@@ -511,22 +482,9 @@
filter->CheckReorderToOpMem(conv_pd.weights_primitive_desc(), &net);
obp->CheckReorderToOpMem(conv_pd.diff_dst_primitive_desc(), &net);
- // Memory for output of convolution. Since we may need reorder on the
- // output side, we will prepare reorder primitive in case output
- // reorder to user memory is required.
- bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
- conv_pd.diff_src_primitive_desc());
-
net.push_back(convolution_backward_data(conv_pd, obp->GetOpMem(),
filter->GetOpMem(), output->GetOpMem()));
- // Insert reorder primitive in the net for output reorder if reorder is
- // required.
- if (output_reorder_required) {
- output->InsertReorderToUserMem(&net);
- }
-
- // Handle output reorder
stream(stream::kind::eager).submit(net).wait();
}
};
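
The hunks above replace the monolithic Compute() of MklConv2DCustomBackpropInputOp with a handful of overrides of MklConv2DBackpropCommonOp (added later in mkl_conv_ops.h), which keeps the shared size/primitive-descriptor logic in one place. A minimal, self-contained sketch of that template-method split, using hypothetical names rather than the real TensorFlow/MKL-DNN types, might look like this:

    // Illustrative sketch only: the real base class is MklConv2DBackpropCommonOp
    // and carries far more state (MklDnnData buffers, padding, strides, ...).
    #include <cstdio>
    #include <vector>

    struct Dims { std::vector<int> d; };

    class ConvBackpropCommon {
     public:
      virtual ~ConvBackpropCommon() {}
      // Shared driver: gathers forward sizes, then defers the operator-specific
      // part to the hooks below.
      void Compute(const Dims& fwd_input_dims, const Dims& fwd_filter_dims) {
        const Dims& out_dims = GetOutputDims(fwd_input_dims, fwd_filter_dims);
        CreatePrimitive(out_dims);
      }
     protected:
      // Hooks each concrete backprop kernel must provide.
      virtual const Dims& GetOutputDims(const Dims& fwd_input_dims,
                                        const Dims& fwd_filter_dims) = 0;
      virtual void CreatePrimitive(const Dims& bwd_output_dims) = 0;
    };

    class ConvBackpropInput : public ConvBackpropCommon {
     protected:
      // BackpropInput's output shape equals the forward input shape.
      const Dims& GetOutputDims(const Dims& fwd_input_dims,
                                const Dims& /*fwd_filter_dims*/) override {
        return fwd_input_dims;
      }
      void CreatePrimitive(const Dims& bwd_output_dims) override {
        std::printf("output rank: %zu\n", bwd_output_dims.d.size());
      }
    };

    int main() {
      ConvBackpropInput op;
      op.Compute(Dims{{8, 3, 32, 32}}, Dims{{16, 3, 3, 3}});
      return 0;
    }
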
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index a9872b8..04268f2 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -40,8 +40,7 @@
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/mkl_util.h"
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
+
#ifdef INTEL_MKL_DNN
#include "mkldnn.hpp"
@@ -51,6 +50,9 @@
using mkldnn::convolution_forward;
using mkldnn::convolution_direct;
+#else
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
#endif
namespace tensorflow {
@@ -288,10 +290,8 @@
mkl_filter_output_mkl_shape.SetMklLayout(mkl_context.prim_fwd,
dnnResourceFilter);
- size_t filter_sizes[4] = {static_cast<size_t>(filter.dim_size(0)),
- static_cast<size_t>(filter.dim_size(1)),
- static_cast<size_t>(filter.dim_size(2)),
- static_cast<size_t>(filter.dim_size(3))};
+ size_t filter_sizes[4] = {filter.dim_size(0), filter.dim_size(1),
+ filter.dim_size(2), filter.dim_size(3)};
mkl_filter_output_mkl_shape.SetTfLayout(filter.dims(), filter_sizes,
mkl_context.filter_strides);
@@ -514,6 +514,12 @@
const Tensor& src_tensor = MklGetInput(context, src_idx);
const Tensor& filter_tensor = MklGetInput(context, filter_idx);
+ MklDnnShape src_mkl_shape, filter_mkl_shape;
+ GetMklShape(context, src_idx, &src_mkl_shape);
+ GetMklShape(context, filter_idx, &filter_mkl_shape);
+ CHECK(!filter_mkl_shape.IsMklTensor())
+ << "Conv2D filter should not be in MKL Layout";
+
MklDnnData<T> src(&cpu_engine);
MklDnnData<T> filter(&cpu_engine);
MklDnnData<T> output(&cpu_engine);
@@ -523,8 +529,9 @@
// Get shapes of input tensors in MKL-DNN order
MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
- conv_utl.GetConvFwdSizesInMklOrder(src_tensor.shape(),
- filter_tensor.shape(),
+ auto src_tf_shape = GetTfShape(context, src_idx);
+ auto filter_tf_shape = GetTfShape(context, filter_idx);
+ conv_utl.GetConvFwdSizesInMklOrder(src_tf_shape, filter_tf_shape,
&src_dims, &filter_dims, &strides,
&output_dims_tf_order,
&output_dims_mkl_order, &padding_l,
@@ -532,58 +539,47 @@
if (!context->status().ok()) return;
// Check for corner case - if there is nothing to compute, return.
- TensorShape tf_output_shape({output_dims_tf_order[0],
- output_dims_tf_order[1],
- output_dims_tf_order[2],
- output_dims_tf_order[3]});
- Tensor* output_tensor = nullptr;
- MklShape mkl_output_mkl_shape;
- mkl_output_mkl_shape.SetMklTensor(false);
- AllocateOutputSetMklShape(context, 0, &output_tensor, tf_output_shape,
- mkl_output_mkl_shape);
+ TensorShape output_tf_shape = MklDnnDimsToTFShape(output_dims_tf_order);
// Forward filter in TF format from input at index 1 to output at index 1.
ForwardTfTensorInToOut(context, 1, 1);
- if (tf_output_shape.num_elements() == 0) {
+      // Corner cases: output with 0 elements or 0 batch size.
+ Tensor* output_tensor = nullptr;
+ if (output_tf_shape.num_elements() == 0 ||
+ output_dims_tf_order[0] == 0) {
// TODO(jbobba): Verify correctness here
// Need semantics for Null MKL tensor
+ MklDnnShape output_mkl_shape;
+ output_mkl_shape.SetMklTensor(false);
+ AllocateOutputSetMklShape(context, 0, &output_tensor, src_tf_shape,
+ output_mkl_shape);
return;
}
- // Corner case to handle 0 batch size.
- if (output_dims_tf_order[0] == 0) {
- // Nothing to do, allocate output tensor and return
- // TODO(nhasabni): remove this code later once serialization
- // in MKL-DNN is supported.
- AllocateOutputSetMklShape(context, 0, &output_tensor,
- src_tensor.shape(), mkl_output_mkl_shape);
- return;
- } else {
- // Otherwise regular output tensor allocation
- // Allocate output tensor.
- }
- CHECK_NOTNULL(output_tensor);
-
// Create memory for user data.
// Describe how the inputs and outputs of Convolution look like. Also
// specify buffers containing actual input and output data.
- // Although input shape (src_dims) required is in MKL-DNN order,
- // the layout is Tensorflow's layout (NHWC or NCHW depending on data
- // format).
- src.SetUsrMem(src_dims, TFDataFormatToMklDnnDataFormat(data_format_),
- const_cast<void*>(static_cast<const void*>(
- src_tensor.flat<T>().data())));
+ auto tf_fmt = TFDataFormatToMklDnnDataFormat(data_format_);
+ // If input is in MKL layout, then simply grab input layout; otherwise,
+      // construct input TF layout. For TF layout, although input shape
+ // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
+ // layout (NHWC or NCHW depending on data format).
+ auto src_md = src_mkl_shape.IsMklTensor()
+ ? src_mkl_shape.GetMklLayout()
+ : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
+ src.SetUsrMem(src_md, &src_tensor);
// Although filter shape (filter_dims) required is in MKL-DNN order,
// the layout is Tensorflow's layout (HWIO).
- filter.SetUsrMem(filter_dims, memory::format::hwio,
- const_cast<void*>(static_cast<const void*>(
- filter_tensor.flat<T>().data())));
- // Although output shape (output_dims) required is in MKL-DNN order,
- // layout is Tensorflow's layout (NHWC or NCHW depending on data format).
- output.SetUsrMem(output_dims_mkl_order,
- TFDataFormatToMklDnnDataFormat(data_format_),
- output_tensor->flat<T>().data());
+ auto filter_md = filter_mkl_shape.IsMklTensor()
+ ? filter_mkl_shape.GetMklLayout()
+ : memory::desc(filter_dims, MklDnnType<T>(), memory::format::hwio);
+ filter.SetUsrMem(filter_md, &filter_tensor);
+      // Set output shape (output_dims) required in MKL-DNN order.
+      // Currently, we set the output layout as TensorFlow's layout (NHWC or
+      // NCHW depending on data format), but we later propagate the MKL
+      // layout of the output directly to the next op.
+ output.SetUsrMem(output_dims_mkl_order, tf_fmt);
// Create memory descriptors for convolution data w/ no specified format.
src.SetOpMemDesc(src_dims, memory::format::any);
@@ -596,9 +592,7 @@
memory::dims bias_size;
conv_utl.GetBiasSizeInMklOrder(2 /* bias idx */, &bias_size);
const Tensor& bias_tensor = MklGetInput(context, 2);
- bias.SetUsrMem(bias_size, memory::format::x,
- const_cast<void*>(static_cast<const void*>(
- bias_tensor.flat<T>().data())));
+ bias.SetUsrMem(bias_size, memory::format::x, &bias_tensor);
bias.SetOpMemDesc(bias_size, memory::format::any);
// Create convolution primitive with Bias.
@@ -609,6 +603,10 @@
auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
cpu_engine);
+ AllocateOutputTensor(context, conv_prim_desc,
+ output_dims_mkl_order, tf_fmt, &output_tensor);
+ // Set data handle for output.
+ output.SetUsrMemDataHandle(output_tensor);
PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output);
} else {
// Create convolution primitive without Bias.
@@ -619,6 +617,10 @@
auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
cpu_engine);
+ AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
+ tf_fmt, &output_tensor);
+ // Set data handle for output.
+ output.SetUsrMemDataHandle(output_tensor);
PrepareAndExecuteNet(conv_prim_desc, &src, &filter, nullptr, &output);
}
} catch (mkldnn::error &e) {
@@ -636,23 +638,44 @@
Padding padding_;
TensorFormat data_format_;
+ // Allocate output tensor.
+ void AllocateOutputTensor(
+ OpKernelContext* context,
+ const convolution_forward::primitive_desc& conv_prim_desc,
+ const memory::dims& output_dims_mkl_order,
+ memory::format output_tf_format, Tensor** output_tensor) {
+ CHECK_NOTNULL(output_tensor);
+ auto dst_pd = conv_prim_desc.dst_primitive_desc();
+
+ // Allocate shape of Mkl tensor.
+ MklDnnShape output_mkl_shape;
+ output_mkl_shape.SetMklTensor(true);
+ output_mkl_shape.SetMklLayout(&dst_pd);
+ output_mkl_shape.SetElemType(MklDnnType<T>());
+ output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+ output_dims_mkl_order, output_tf_format);
+
+ // Allocate shape of TF tensor.
+ TensorShape output_tf_shape;
+ output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T)));
+
+ const int kOutputSlotIdx = 0;
+ AllocateOutputSetMklShape(context, kOutputSlotIdx, output_tensor,
+ output_tf_shape, output_mkl_shape);
+ }
+
// Prepare and execute net - checks for input and output reorders.
void PrepareAndExecuteNet(
const convolution_forward::primitive_desc& conv_prim_desc,
MklDnnData<T>* src, MklDnnData<T>* filter,
MklDnnData<T>* bias, MklDnnData<T>* output) {
// Create reorders between user layout and MKL layout if it is needed and
- // add it to the net before convolution.
+ // add it to the net before convolution. No need to check for output
+ // reorder as we propagate output layout to the next layer.
std::vector<primitive> net;
src->CheckReorderToOpMem(conv_prim_desc.src_primitive_desc(), &net);
filter->CheckReorderToOpMem(conv_prim_desc.weights_primitive_desc(), &net);
- // Memory for output of convolution. Since we may need reorder on the
- // output side, we will prepare reorder primitive in case output
- // reorder to user memory is required.
- bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
- conv_prim_desc.dst_primitive_desc());
-
// Create convolution primitive and add it to net.
if (bias) {
CHECK_EQ(biasEnabled, true);
@@ -665,13 +688,6 @@
filter->GetOpMem(), output->GetOpMem()));
}
- // Insert reorder primitive in the net for output reorder if reorder is
- // required.
- if (output_reorder_required) {
- output->InsertReorderToUserMem(&net);
- }
-
- // Handle output reorder
stream(stream::kind::eager).submit(net).wait();
}
};
@@ -688,7 +704,12 @@
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.Label(mkl_op_registry::kMklOpLabel), \
- MklConv2DOp<CPUDevice, T, true>);
+ MklConv2DOp<CPUDevice, T, true>); \
+ REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .Label(mkl_op_registry::kMklOpLabel), \
+ MklDummyOp<CPUDevice, T>);
TF_CALL_float(REGISTER_MKL_CPU);
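
AllocateOutputTensor in the hunks above sizes the TensorFlow output as a flat 1-D tensor of dst_pd.get_size()/sizeof(T) elements and records the real layout in an MklDnnShape, so downstream MKL ops can reinterpret the flat buffer. A small self-contained sketch of that arithmetic, with a stand-in for the primitive descriptor (names here are illustrative, not the real API), follows:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct FakePrimitiveDesc {
      std::vector<int> dims_nchw;  // logical output dims in MKL-DNN (NCHW) order
      std::size_t padded_elems;    // MKL-DNN may pad, so this can exceed N*C*H*W
      std::size_t get_size() const { return padded_elems * sizeof(float); }
    };

    int main() {
      FakePrimitiveDesc dst_pd{{8, 16, 30, 30}, 8 * 16 * 30 * 32};
      // Flat TF shape: one dimension holding every element the primitive needs.
      std::size_t flat_elems = dst_pd.get_size() / sizeof(float);
      assert(flat_elems >= 8u * 16u * 30u * 30u);
      // The logical shape is kept separately (SetTfLayout in the real code).
      std::vector<int> logical = dst_pd.dims_nchw;
      return (logical.size() == 4 && flat_elems > 0) ? 0 : 1;
    }
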
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index f0cb37f..47a9b4b 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -41,6 +41,12 @@
#ifdef INTEL_MKL_DNN
#include "mkldnn.hpp"
+
+using mkldnn::stream;
+using mkldnn::prop_kind;
+
+using mkldnn::convolution_forward;
+using mkldnn::convolution_direct;
#endif
namespace tensorflow {
@@ -108,7 +114,13 @@
#undef CHECK_BOUNDS
// MKL-DNN always requires input in NCHW format.
- *input_dims = {input_batch, input_depth, input_rows, input_cols};
+ std::vector<int> mkldnn_sizes(4, -1);
+ mkldnn_sizes[MklDnnDims::Dim_N] = input_batch;
+ mkldnn_sizes[MklDnnDims::Dim_C] = input_depth;
+ mkldnn_sizes[MklDnnDims::Dim_H] = input_rows;
+ mkldnn_sizes[MklDnnDims::Dim_W] = input_cols;
+
+ *input_dims = mkldnn_sizes;
}
// Calculate Convolution filter size in MKL-DNN order. MKL-DNN
@@ -156,7 +168,13 @@
// MKL-DNN always needs filter in OIHW format.
// OIHW = (out_depth, in_depth, rows, cols)
- *filter_dims = {out_depth, in_depth, filter_rows, filter_cols};
+ std::vector<int> mkldnn_sizes(4, -1);
+ mkldnn_sizes[MklDnnDims::Dim_O] = out_depth;
+ mkldnn_sizes[MklDnnDims::Dim_I] = in_depth;
+ mkldnn_sizes[MklDnnDims::Dim_H] = filter_rows;
+ mkldnn_sizes[MklDnnDims::Dim_W] = filter_cols;
+
+ *filter_dims = mkldnn_sizes;
}
// Calculate Convolution filter size in MKL-DNN order. MKL-DNN
@@ -167,9 +185,9 @@
GetFilterSizeInMklOrder(size_t src_index, size_t filter_index,
memory::dims *filter_dims) {
CHECK_NOTNULL(filter_dims);
- const Tensor& input = MklGetInput(context_, src_index);
- const Tensor& filter = MklGetInput(context_, filter_index);
- GetFilterSizeInMklOrder(input.shape(), filter.shape(), filter_dims);
+ GetFilterSizeInMklOrder(GetTfShape(context_, src_index),
+ GetTfShape(context_, filter_index),
+ filter_dims);
}
// Calculate Bias size for 2D Convolution. Function does not return
@@ -238,8 +256,12 @@
*output_dims_tf_order = TFShapeToMklDnnDims(out_shape);
// MKL-DNN always needs output in NCHW format.
- *output_dims_mkl_order = {out_batch, out_depth, static_cast<int>(out_rows),
- static_cast<int>(out_cols)};
+ std::vector<int> mkldnn_sizes(4, -1);
+ mkldnn_sizes[MklDnnDims::Dim_N] = out_batch;
+ mkldnn_sizes[MklDnnDims::Dim_C] = out_depth;
+ mkldnn_sizes[MklDnnDims::Dim_H] = static_cast<int>(out_rows);
+ mkldnn_sizes[MklDnnDims::Dim_W] = static_cast<int>(out_cols);
+ *output_dims_mkl_order = mkldnn_sizes;
    // Now handle padding. MKL-DNN uses asymmetric padding.
*pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
@@ -261,14 +283,14 @@
CHECK_NOTNULL(pad_l);
CHECK_NOTNULL(pad_r);
- const Tensor& input = MklGetInput(context_, src_index);
- const Tensor& filter = MklGetInput(context_, filter_index);
+ auto input_tf_shape = GetTfShape(context_, src_index);
+ auto filter_tf_shape = GetTfShape(context_, filter_index);
- OP_REQUIRES(context_, input.dims() == 4,
+ OP_REQUIRES(context_, input_tf_shape.dims() == 4,
errors::InvalidArgument("input must be 4-dimensional",
- input.shape().DebugString()));
+ input_tf_shape.DebugString()));
- GetOutputAndPadSizeInMklOrder(input.shape(), filter.shape(),
+ GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape,
strides, output_dims_tf_order,
output_dims_mkl_order, pad_l, pad_r);
}
@@ -309,8 +331,231 @@
}
};
+/////////////////////////////////////////////////////////////////////
+/// Common class that implements Conv2DBackpropFilter and Input
+/////////////////////////////////////////////////////////////////////
+
+template <typename Device, class T>
+class MklConv2DBackpropCommonOp : public OpKernel {
+ public:
+ ~MklConv2DBackpropCommonOp() {}
+ explicit MklConv2DBackpropCommonOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ string data_format_str;
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
+ OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
+ errors::InvalidArgument("Invalid data format"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ int stride_n = GetTensorDim(strides_, data_format_, 'N');
+ int stride_c = GetTensorDim(strides_, data_format_, 'C');
+ OP_REQUIRES(
+ context, (stride_n == 1 && stride_c == 1),
+ errors::InvalidArgument("Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ try {
+ auto cpu_engine = engine(engine::cpu, 0);
+
+ // Prepare common tensors for Conv2DBackpropInput and
+ // Conv2DBackpropFilter.
+ MklDnnData<T> input(&cpu_engine);
+ MklDnnData<T> filter(&cpu_engine);
+ MklDnnData<T> outbackprop(&cpu_engine);
+ MklDnnData<T> output(&cpu_engine);
+
+ // Input tensors
+ const int kInputIdx = 0, kFilterIdx = 1, kOutbpropIdx = 2;
+ const Tensor& input_tensor = MklGetInput(context, kInputIdx);
+ const Tensor& filter_tensor = MklGetInput(context, kFilterIdx);
+ const Tensor& outbprop_tensor = MklGetInput(context, kOutbpropIdx);
+
+ MklDnnShape input_mkl_shape, filter_mkl_shape, outbprop_mkl_shape;
+ GetMklShape(context, kInputIdx, &input_mkl_shape);
+ GetMklShape(context, kFilterIdx, &filter_mkl_shape);
+ GetMklShape(context, kOutbpropIdx, &outbprop_mkl_shape);
+ // Allow operator-specific sanity checking of shapes.
+ ValidateMklShapes(input_mkl_shape, filter_mkl_shape, outbprop_mkl_shape);
+
+ // Allow operator-specific generation of shapes.
+ // E.g., Conv2DBackpropFilter gets filter as filter_sizes. It is a
+ // tensor containing shape of filter. So filter.shape() is not
+ // a correct way to get filter shape. These operator-specific calls
+ // allow this class to handle this case.
+ TensorShape input_tf_shape = MakeInputTfShape(context, input_tensor);
+ TensorShape filter_tf_shape = MakeFilterTfShape(context, filter_tensor);
+ TensorShape outbprop_tf_shape = GetTfShape(context, kOutbpropIdx);
+
+ // By default, all dims are in MKL order. Only dims in TF order
+ // are those with prefix tf_order.
+ memory::dims outbprop_dims, fwd_input_dims, fwd_filter_dims;
+ memory::dims padding_l, padding_r, strides, fwd_output_dims;
+ memory::dims fwd_output_dims_tf_order;
+
+ // Get forward convolution parameters.
+ MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
+ conv_utl.GetConvFwdSizesInMklOrder(input_tf_shape, filter_tf_shape,
+ &fwd_input_dims, &fwd_filter_dims,
+ &strides,
+ &fwd_output_dims_tf_order,
+ &fwd_output_dims,
+ &padding_l, &padding_r);
+ if (!context->status().ok()) return;
+
+ // Create Convolution forward descriptor since Convolution backward
+ // API needs it. For that, we first need to create input, filter
+ // and output memory descriptors.
+ auto tf_fmt = TFDataFormatToMklDnnDataFormat(data_format_);
+ // If input is in MKL layout, then simply grab input layout; otherwise,
+ // construct input TF layout. For TF layout, although input shape
+ // required is in MKL-DNN order, the layout is Tensorflow's layout
+ // (NHWC or NCHW depending on data format).
+ auto fwd_input_md = input_mkl_shape.IsMklTensor() ?
+ input_mkl_shape.GetMklLayout() :
+ memory::desc(fwd_input_dims, MklDnnType<T>(), tf_fmt);
+ // If filter is in MKL layout, then simply grab filter layout; otherwise
+ // construct filter in TF layout. For TF layout, filter is in HWIO format.
+ auto fwd_filter_md = filter_mkl_shape.IsMklTensor() ?
+ filter_mkl_shape.GetMklLayout() :
+ memory::desc(fwd_filter_dims, MklDnnType<T>(),
+ memory::format::hwio);
+ // Tensorflow Output of Conv2D is in data_format order.
+ auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType<T>(), tf_fmt);
+ auto fwd_desc = convolution_forward::desc(prop_kind::forward,
+ convolution_direct, fwd_input_md, fwd_filter_md, fwd_out_md,
+ strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
+ auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
+
+ // Create memory for user data. Describe how the inputs and outputs of
+ // Convolution look like. Also specify buffers containing actual input
+ // and output data.
+
+ // Since this is a common class for both Conv2DBackpropFilter and
+ // Conv2DBackpropInput, we skip SetUsrMem call for input tensor (for
+ // Conv2DBackpropInput) and for filter tensor (for
+      // Conv2DBackpropFilter) depending on which tensor is of int32 type.
+ size_t input_with_sizes = GetInputTensorIndexWithSizes();
+ if (input_with_sizes != kInputIdx) {
+ // Shape of Conv2DBackpropFilter's input is same as Conv2D input.
+ input.SetUsrMem(fwd_input_md, &input_tensor);
+ } else if (input_with_sizes != kFilterIdx) {
+ // Shape of Conv2DBackpropInput's filter is same as Conv2D filter.
+ filter.SetUsrMem(fwd_filter_md, &filter_tensor);
+ }
+
+ conv_utl.GetInputSizeInMklOrder(outbprop_tf_shape, &outbprop_dims);
+ if (!context->status().ok()) return;
+ if (outbprop_mkl_shape.IsMklTensor()) {
+ // If outbackprop is in Mkl layout, then simply grab it.
+ auto outbprop_md = outbprop_mkl_shape.GetMklLayout();
+ outbackprop.SetUsrMem(outbprop_md, &outbprop_tensor);
+ } else {
+ // If outbackprop is in TensorFlow layout, then we need to create memory
+ // descriptor for it. Outbackprop shape is data format order.
+ outbackprop.SetUsrMem(outbprop_dims, tf_fmt, &outbprop_tensor);
+ }
+
+ // Operator specific call to get output shape and data_format.
+ auto bwd_output_dims = GetOutputDims(fwd_input_dims, fwd_filter_dims);
+ auto bwd_output_format = GetOutputFormat(tf_fmt);
+ output.SetUsrMem(bwd_output_dims, bwd_output_format);
+
+ // Create memory descriptors for convolution data w/ no specified format.
+ input.SetOpMemDesc(fwd_input_dims, memory::format::any);
+ filter.SetOpMemDesc(fwd_filter_dims, memory::format::any);
+ outbackprop.SetOpMemDesc(outbprop_dims, memory::format::any);
+ output.SetOpMemDesc(bwd_output_dims, memory::format::any);
+
+ // Operator-specific call to create and execute primitive.
+ Tensor* output_tensor = nullptr;
+ CreatePrimitive(context, cpu_engine, fwd_pd, &input, &filter,
+ &outbackprop, &output, &output_tensor,
+ strides, padding_l, padding_r,
+ TFPaddingToMklDnnPadding(padding_),
+ bwd_output_dims, bwd_output_format);
+ } catch (mkldnn::error &e) {
+ string error_msg = "Status: " + std::to_string(e.status) +
+ ", message: " + string(e.message) +
+ ", in file " + string(__FILE__) + ":" +
+ std::to_string(__LINE__);
+ OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:",
+ error_msg));
+ }
+ }
+
+ /// Pure virtual function to allow operator to check for validity of input
+ /// shapes. Function asserts that input shapes are valid.
+ virtual void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
+ const MklDnnShape& filter_mkl_shape,
+ const MklDnnShape& outbprop_mkl_shape) = 0;
+
+  /// Operator-specific function that returns the index of the input that
+  /// carries the input sizes. For Conv2DBackpropFilter it returns 1, since
+  /// that operator's filter input holds the filter shape. For
+  /// Conv2DBackpropInput it returns 0 (the input).
+ virtual size_t GetInputTensorIndexWithSizes() = 0;
+
+ /// Get TensorFlow shape of input tensor.
+ virtual TensorShape MakeInputTfShape(OpKernelContext* context,
+ const Tensor& input_tensor) = 0;
+
+ /// Get TensorFlow shape of filter tensor.
+ virtual TensorShape MakeFilterTfShape(OpKernelContext* context,
+ const Tensor& filter_tensor) = 0;
+
+ /// Get shape of output in MKL-DNN order. Computes shape of output from
+ /// input shape (fwd_input_dims) and filter shape (fwd_filter_dims).
+ virtual
+ const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
+ const memory::dims& fwd_filter_dims) = 0;
+
+  /// Get data_format of output in MKL-DNN order. If the output data format
+  /// is the same as the input data format, this simply returns the
+  /// data_format parameter unchanged.
+ virtual memory::format GetOutputFormat(const memory::format data_format) = 0;
+
+ /// Create and execute the primitive storing output in the output_tensor.
+ virtual void CreatePrimitive(OpKernelContext* context,
+ const engine& cpu_engine,
+ const convolution_forward::primitive_desc& conv_fwd_pd,
+ MklDnnData<T>* input, MklDnnData<T>* filter, MklDnnData<T>* outbackprop,
+ MklDnnData<T>* output, Tensor** output_tensor, const memory::dims& strides,
+ const memory::dims& padding_l, const memory::dims& padding_r,
+ padding_kind padding, const memory::dims& bwd_output_dims,
+ memory::format bwd_output_format) = 0;
+
+  // Get the data_format {NCHW, NHWC}.
+  TensorFormat GetTFDataFormat() { return data_format_; }
+
+ private:
+ std::vector<int32> strides_;
+ Padding padding_;
+ TensorFormat data_format_;
+};
#endif // INTEL_MKL_DNN
+/////////////////////////////////////////////////////////////////////
+/// Dummy MKL op used for operators that are intermediate outputs of node
+/// fusion in the graph.
+/////////////////////////////////////////////////////////////////////
+
+template <typename Device, typename T>
+class MklDummyOp : public OpKernel {
+ public:
+ ~MklDummyOp() {}
+
+ explicit MklDummyOp(OpKernelConstruction* context) :
+ OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+    TF_CHECK_OK(errors::Unimplemented("This is a dummy op. "
+                                      "It should not have been invoked."));
+ }
+};
+
} // namespace tensorflow
#endif // TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
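
Several hunks in mkl_conv_ops.h above replace brace-initialized dims (e.g. {out_batch, out_depth, ...}) with explicit writes into named MklDnnDims slots, which makes the NCHW/OIHW ordering assumptions visible at each assignment. A self-contained sketch of that pattern, with a stand-in enum that assumes the usual NCHW index order (N=0, C=1, H=2, W=3) rather than the real MklDnnDims from mkl_util.h, is:

    #include <cassert>
    #include <vector>

    enum FakeMklDnnDims { Dim_N = 0, Dim_C = 1, Dim_H = 2, Dim_W = 3 };

    std::vector<int> InputDimsInMklOrder(int batch, int depth, int rows, int cols) {
      std::vector<int> mkldnn_sizes(4, -1);
      mkldnn_sizes[Dim_N] = batch;  // explicit indices make the ordering obvious
      mkldnn_sizes[Dim_C] = depth;
      mkldnn_sizes[Dim_H] = rows;
      mkldnn_sizes[Dim_W] = cols;
      return mkldnn_sizes;
    }

    int main() {
      std::vector<int> dims = InputDimsInMklOrder(8, 3, 224, 224);
      assert(dims[Dim_C] == 3 && dims[Dim_W] == 224);
      return 0;
    }
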
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index bc9e906..a761562 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -25,10 +25,24 @@
#include "mkl_dnn_types.h"
#include "tensorflow/core/util/mkl_util.h"
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+
+using mkldnn::stream;
+using mkldnn::prop_kind;
+using mkldnn::use_scale_shift;
+using mkldnn::use_global_stats;
+using mkldnn::batch_normalization_forward;
+using mkldnn::batch_normalization_backward;
+#endif
+
// TODO(inteltf) Address comments from PR 8968.
namespace tensorflow {
using CPUDevice = Eigen::ThreadPoolDevice;
+
+#ifndef INTEL_MKL_DNN
+
template <typename Device, typename T>
class MklFusedBatchNormOp : public OpKernel {
public:
@@ -46,7 +60,6 @@
void Compute(OpKernelContext* context) override {
MklFusedBatchNormOpContext mkl_context;
-
const Tensor& input = MklGetInput(context, 0);
const Tensor& scale = MklGetInput(context, 1);
const Tensor& shift = MklGetInput(context, 2);
@@ -55,6 +68,7 @@
GetMklShape(context, 0, &(mkl_context.mkl_shape_input_shape));
bool input_in_mkl_format = mkl_context.mkl_shape_input_shape.IsMklTensor();
+
if (!input_in_mkl_format) {
OP_REQUIRES(context, input.dims() == 4,
errors::InvalidArgument("input must be 4-dimensional",
@@ -69,10 +83,12 @@
OP_REQUIRES(context, est_mean.dims() == 1,
errors::InvalidArgument("estimated_mean must be 1-dimensional",
est_mean.shape().DebugString()));
+
OP_REQUIRES(
context, est_variance.dims() == 1,
errors::InvalidArgument("estimated_variance must be 1-dimensional",
est_variance.shape().DebugString()));
+
if (is_training_) {
OP_REQUIRES(context, est_mean.dim_size(0) == 0,
errors::InvalidArgument("estimated_mean empty for training",
@@ -258,7 +274,6 @@
E_SUCCESS);
}
}
-
void MklPrepareContextInputs(OpKernelContext* context,
Tensor* mkl_tmp_input_buf_tensor,
Tensor* mkl_tmp_scale_shift_buf_tensor) {
@@ -325,15 +340,6 @@
} MklFusedBatchNormOpContext;
};
-#define REGISTER_MKL_CPU(T) \
- REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNorm") \
- .Device(DEVICE_CPU) \
- .TypeConstraint<T>("T") \
- .Label(mkl_op_registry::kMklOpLabel), \
- MklFusedBatchNormOp<CPUDevice, T>);
-TF_CALL_float(REGISTER_MKL_CPU);
-#undef REGISTER_MKL_CPU
-
template <typename Device, typename T>
class MklFusedBatchNormGradOp : public OpKernel {
public:
@@ -595,7 +601,7 @@
mkl_res_batchnorm_bwd[dnnResourceSrc] =
(mkl_convert_input) ? mkl_buf_converted_input : mkl_buf_input;
- bool mkl_convert_out_backprop;
+ bool mkl_convert_out_backprop;
dnnPrimitive_t mkl_prim_convert_out_backprop = nullptr;
dnnLayout_t mkl_lt_internal_out_backprop = nullptr;
void* mkl_buf_converted_out_backprop = nullptr;
@@ -675,6 +681,628 @@
}
} MklFusedBatchNormGradOpContext;
};
+#endif
+
+#ifdef INTEL_MKL_DNN
+
+template <typename Device, typename T>
+class MklFusedBatchNormOp : public OpKernel {
+ public:
+ explicit MklFusedBatchNormOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ float epsilon;
+ OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon));
+ epsilon_ = T(epsilon);
+ string tensor_format;
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
+ OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
+ errors::InvalidArgument("Invalid data format"));
+ OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ try {
+ auto cpu_engine = engine(engine::cpu, 0);
+ const size_t src_index = 0; // index of src input tensor
+ const size_t scale_index = 1; // index of scale tensor
+ const size_t shift_index = 2; // index of shift tensor
+ const size_t mean_index = 3; // index of est_mean tensor
+ const size_t var_index = 4; // index of est_variance tensor
+
+ const Tensor& src_tensor = MklGetInput(context, src_index);
+ const Tensor& scale_tensor = MklGetInput(context, scale_index);
+ const Tensor& shift_tensor = MklGetInput(context, shift_index);
+ const Tensor& est_mean_tensor = MklGetInput(context, mean_index);
+ const Tensor& est_variance_tensor = MklGetInput(context, var_index);
+
+ MklDnnShape dnn_shape_src;
+ GetMklShape(context, src_index, &dnn_shape_src);
+
+ if (dnn_shape_src.IsMklTensor()) {
+ OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4,
+ errors::InvalidArgument(
+ "input must be 4-dimensional",
+ src_tensor.shape().DebugString()));
+ } else {
+ OP_REQUIRES(context, src_tensor.dims() == 4,
+ errors::InvalidArgument(
+ "input must be 4-dimensional",
+ src_tensor.shape().DebugString()));
+ }
+ OP_REQUIRES(context, scale_tensor.dims() == 1,
+ errors::InvalidArgument(
+ "scale must be 1-dimensional",
+ scale_tensor.shape().DebugString()));
+ OP_REQUIRES(context, shift_tensor.dims() == 1,
+ errors::InvalidArgument("offset must be 1-dimensional",
+ shift_tensor.shape().DebugString()));
+ OP_REQUIRES(context, est_mean_tensor.dims() == 1,
+ errors::InvalidArgument(
+ "estimated_mean must be 1-dimensional",
+ est_mean_tensor.shape().DebugString()));
+ OP_REQUIRES(context, est_variance_tensor.dims() == 1,
+ errors::InvalidArgument(
+ "estimated_variance must be 1-dimensional",
+ est_variance_tensor.shape().DebugString()));
+
+ if (is_training_) {
+ OP_REQUIRES(context, est_mean_tensor.dim_size(0) == 0,
+ errors::InvalidArgument(
+ "estimated_mean must be empty for training",
+ est_mean_tensor.shape().DebugString()));
+ OP_REQUIRES(context, est_variance_tensor.dim_size(0) == 0,
+ errors::InvalidArgument(
+ "estimated_variance must be empty for training",
+ est_variance_tensor.shape().DebugString()));
+ }
+
+ if (dnn_shape_src.IsMklTensor())
+ depth_ = dnn_shape_src.DimSize(MklDnnDims::Dim_C);
+ else
+ ExtractParams(context);
+
+ // Indices of output tensors
+ const size_t dst_index = 0;
+ const size_t batch_mean_index = 1;
+ const size_t batch_variance_index = 2;
+ const size_t saved_mean_index = 3;
+ const size_t saved_variance_index = 4;
+
+ // allocate batch mean output tensor
+ Tensor* batch_mean_tensor = nullptr;
+ MklDnnShape mkl_shape_batch_mean;
+ mkl_shape_batch_mean.SetMklTensor(false);
+ AllocateOutputSetMklShape(context,
+ batch_mean_index,
+ &batch_mean_tensor,
+ scale_tensor.shape(),
+ mkl_shape_batch_mean);
+ CHECK_NOTNULL(batch_mean_tensor);
+
+ // Batch variance
+ Tensor* batch_variance_tensor = nullptr;
+ MklDnnShape mkl_shape_batch_variance;
+ mkl_shape_batch_variance.SetMklTensor(false);
+ AllocateOutputSetMklShape(context,
+ batch_variance_index,
+ &batch_variance_tensor,
+ scale_tensor.shape(),
+ mkl_shape_batch_variance);
+ CHECK_NOTNULL(batch_variance_tensor);
+
+ if (is_training_)
+ SetMeanVariance(*batch_mean_tensor, *batch_variance_tensor);
+ else
+ SetMeanVariance(est_mean_tensor, est_variance_tensor);
+
+ MklDnnData<T> src(&cpu_engine);
+ MklDnnData<T> dst(&cpu_engine);
+
+ memory::format format_m;
+ if (dnn_shape_src.IsMklTensor()) {
+ if (dnn_shape_src.IsTensorInNCHWFormat()) {
+ format_m = memory::format::nchw;
+ } else {
+ format_m = memory::format::nhwc;
+ }
+ } else {
+ format_m = TFDataFormatToMklDnnDataFormat(tensor_format_);
+ }
+
+ // set src primitive
+ memory::dims src_dims;
+ if (dnn_shape_src.IsMklTensor()) {
+ src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(),
+ tensor_format_);
+ } else {
+ src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(),
+ tensor_format_);
+ }
+
+ auto src_md = dnn_shape_src.IsMklTensor()
+ ? dnn_shape_src.GetMklLayout()
+ : memory::desc(src_dims, MklDnnType<T>(), format_m);
+ src.SetUsrMem(src_md, &src_tensor);
+
+ // set weights primitive
+ // MKL-DNN packs scale & shift as "weights":
+ // <scale>...<scale><shift>...<shift>
+ auto weights_desc = memory::desc({2, depth_},
+ MklDnnType<T>(),
+ memory::format::nc);
+ auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine);
+ auto weights_m = memory(weights_pd);
+ T* weights_data = reinterpret_cast<T*>(
+ weights_m.get_data_handle());
+ T* scale_tf = reinterpret_cast<T*>(
+ const_cast<T*>(scale_tensor.flat<T>().data()));
+ T* shift_tf = reinterpret_cast<T*>(
+ const_cast<T*>(shift_tensor.flat<T>().data()));
+
+ for (int k=0; k < depth_; k++) {
+ weights_data[k] = scale_tf[k];
+ weights_data[k + depth_] = shift_tf[k];
+ }
+
+ // Mean and variance (without Bessel's correction) saved for backward
+ // computation to serve as pre-computed mean and variance.
+ Tensor* saved_mean_tensor = nullptr;
+ MklDnnShape mkl_shape_saved_mean;
+ mkl_shape_saved_mean.SetMklTensor(false);
+ AllocateOutputSetMklShape(context, saved_mean_index,
+ &saved_mean_tensor,
+ scale_tensor.shape(),
+ mkl_shape_saved_mean);
+ CHECK_NOTNULL(saved_mean_tensor);
+
+ Tensor* saved_variance_tensor = nullptr;
+ MklDnnShape mkl_shape_saved_variance;
+ mkl_shape_saved_variance.SetMklTensor(false);
+ AllocateOutputSetMklShape(context, saved_variance_index,
+ &saved_variance_tensor,
+ scale_tensor.shape(),
+ mkl_shape_saved_variance);
+ CHECK_NOTNULL(saved_variance_tensor);
+
+ // set mean primitive
+ auto mean_desc = memory::desc({1, depth_},
+ MklDnnType<T>(),
+ memory::format::nc);
+ auto mean_pd = memory::primitive_desc(mean_desc, cpu_engine);
+ char* saved_mean_data_tf = reinterpret_cast<char*>
+ (saved_mean_tensor->flat<T>().data());
+ std::memcpy(saved_mean_data_tf,
+ reinterpret_cast<char*>(mean_values_),
+ depth_*sizeof(T));
+ auto mean_m = memory(mean_pd,
+ reinterpret_cast<void*>(saved_mean_data_tf));
+
+ // set variance primitive
+ auto variance_desc = memory::desc({1, depth_},
+ MklDnnType<T>(),
+ memory::format::nc);
+ auto variance_pd = memory::primitive_desc(variance_desc, cpu_engine);
+ char* saved_variance_data_tf = reinterpret_cast<char*>
+ (saved_variance_tensor->flat<T>().data());
+ std::memcpy(saved_variance_data_tf,
+ reinterpret_cast<char*>(variance_values_),
+ depth_*sizeof(T));
+ auto variance_m = memory(variance_pd, saved_variance_data_tf);
+
+ prop_kind pk = (is_training_) ?
+ prop_kind::forward_training :
+ prop_kind::forward_scoring;
+ auto bnrm_fwd_desc = batch_normalization_forward::desc(
+ pk, src.GetUsrMemDesc(), epsilon_,
+ is_training_ ? use_scale_shift :
+ (use_scale_shift | use_global_stats));
+ auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc(
+ bnrm_fwd_desc, cpu_engine);
+
+ // allocate dst tensor
+ MklDnnShape dnn_shape_dst;
+ TensorShape tf_shape_dst;
+ Tensor* dst_tensor = nullptr;
+ if (dnn_shape_src.IsMklTensor()) {
+ dnn_shape_dst.SetMklTensor(true);
+ auto dst_pd = bnrm_fwd_pd.dst_primitive_desc();
+ dnn_shape_dst.SetMklLayout(&dst_pd);
+ dnn_shape_dst.SetElemType(MklDnnType<T>());
+ dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(),
+ src_dims, format_m);
+ tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T));
+ } else {
+ dnn_shape_dst.SetMklTensor(false);
+ tf_shape_dst = src_tensor.shape();
+ }
+ AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+ tf_shape_dst, dnn_shape_dst);
+
+ // Output of batchnorm has same shape as input.
+ dst.SetUsrMem(src_md, dst_tensor);
+
+ primitive bnrm_fwd_op;
+ if (is_training_) {
+ bnrm_fwd_op = batch_normalization_forward(
+ bnrm_fwd_pd,
+ src.GetOpMem(),
+ weights_m,
+ dst.GetOpMem(),
+ mean_m,
+ variance_m);
+ } else {
+ bnrm_fwd_op = batch_normalization_forward(
+ bnrm_fwd_pd,
+ src.GetOpMem(),
+ mean_m,
+ variance_m,
+ (const primitive::at) weights_m,
+ dst.GetOpMem());
+ }
+ std::vector<primitive> net;
+ net.push_back(bnrm_fwd_op);
+ stream(stream::kind::eager).submit(net).wait();
+
+ // copy batch_mean data
+ T* batch_mean_data_tf = reinterpret_cast<T*>(
+ batch_mean_tensor->flat<T>().data());
+ std::memcpy(reinterpret_cast<char*>(batch_mean_data_tf),
+ reinterpret_cast<char*>(mean_m.get_data_handle()),
+ depth_*sizeof(T));
+
+ // copy batch_variance data with Bessel's correction
+ // if training mode is on
+ float adjust_factor = 1.0;
+ if (is_training_) {
+ size_t orig_size = src_dims[0] * src_dims[2] * src_dims[3];
+ size_t adjust_size = orig_size - 1;
+ adjust_factor = (static_cast<float>(orig_size)) / adjust_size;
+ }
+ T* batch_variance_data_tf = reinterpret_cast<T*>(
+ batch_variance_tensor->flat<T>().data());
+ for (int k=0; k < depth_; k++)
+ batch_variance_data_tf[k] =
+ (reinterpret_cast<T*>(variance_m.get_data_handle()))[k]
+ * adjust_factor;
+ } catch (mkldnn::error &e) {
+ string error_msg = "Status: " + std::to_string(e.status) +
+ ", message: " + string(e.message) +
+ ", in file " + string(__FILE__) + ":" +
+ std::to_string(__LINE__);
+ OP_REQUIRES_OK(context,
+ errors::Aborted("Operation received an exception:",
+ error_msg));
+ }
+ }
+
+ private:
+ T epsilon_;
+ TensorFormat tensor_format_;
+ bool is_training_;
+ T* mean_values_;
+ T* variance_values_;
+  size_t depth_;  // Batch normalization is done per channel.
+
+ void ExtractParams(OpKernelContext* context) {
+ const Tensor& input = MklGetInput(context, 0);
+ depth_ = static_cast<int>(GetTensorDim(input, tensor_format_, 'C'));
+ }
+
+ void SetMeanVariance(const Tensor& mean, const Tensor& variance) {
+ mean_values_ = reinterpret_cast<T*>(
+ const_cast<T*>(mean.flat<T>().data()));
+ variance_values_ = reinterpret_cast<T*>(
+ const_cast<T*>(variance.flat<T>().data()));
+ }
+};
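
As the comments in the forward kernel above note, MKL-DNN's batch normalization takes a single 2 x depth "weights" buffer laid out as <scale>...<scale><shift>...<shift>. A minimal sketch of that packing, with a plain std::vector standing in for the mkldnn::memory object used in the real code:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    std::vector<float> PackScaleShift(const std::vector<float>& scale,
                                      const std::vector<float>& shift) {
      assert(scale.size() == shift.size());
      const std::size_t depth = scale.size();
      std::vector<float> weights(2 * depth);
      for (std::size_t k = 0; k < depth; ++k) {
        weights[k] = scale[k];          // first row: per-channel scale (gamma)
        weights[k + depth] = shift[k];  // second row: per-channel shift (beta)
      }
      return weights;
    }

    int main() {
      std::vector<float> w = PackScaleShift({1.f, 2.f}, {0.1f, 0.2f});
      assert(w[1] == 2.f && w[3] == 0.2f);
      return 0;
    }
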
+
+
+template <typename Device, typename T>
+class MklFusedBatchNormGradOp : public OpKernel {
+ public:
+ explicit MklFusedBatchNormGradOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ float epsilon;
+ OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon));
+ epsilon_ = T(epsilon);
+ string tensor_format;
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
+ OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
+ errors::InvalidArgument("Invalid data format"));
+ }
+
+ void Compute(OpKernelContext* context) override {
+ try {
+ auto cpu_engine = engine(engine::cpu, 0);
+
+ const size_t diff_dst_index = 0; // index of diff_dst tensor
+ const size_t src_index = 1; // index of src input tensor
+ const size_t scale_index = 2; // index of scale tensor
+ const size_t mean_index = 3; // index of saved_mean tensor
+ const size_t variance_index = 4; // index of saved_variance tensor
+ const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+ const Tensor& src_tensor = MklGetInput(context, src_index);
+ const Tensor& scale_tensor = MklGetInput(context, scale_index);
+ const Tensor& saved_mean_tensor = MklGetInput(context, mean_index);
+ const Tensor& saved_variance_tensor = MklGetInput(context,
+ variance_index);
+
+ MklDnnShape dnn_shape_src, dnn_shape_diff_dst;
+ GetMklShape(context, src_index, &dnn_shape_src);
+ GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+ if (dnn_shape_diff_dst.IsMklTensor()) {
+ OP_REQUIRES(context, dnn_shape_diff_dst.GetDimension() == 4,
+ errors::InvalidArgument(
+ "input must be 4-dimensional",
+ diff_dst_tensor.shape().DebugString()));
+ } else {
+ OP_REQUIRES(context, diff_dst_tensor.dims() == 4,
+ errors::InvalidArgument(
+ "input must be 4-dimensional",
+ diff_dst_tensor.shape().DebugString()));
+ }
+
+ if (dnn_shape_src.IsMklTensor()) {
+ OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4,
+ errors::InvalidArgument(
+ "input must be 4-dimensional",
+ src_tensor.shape().DebugString()));
+ } else {
+ OP_REQUIRES(context, src_tensor.dims() == 4,
+ errors::InvalidArgument(
+ "input must be 4-dimensional",
+ src_tensor.shape().DebugString()));
+ }
+
+ OP_REQUIRES(context, scale_tensor.dims() == 1,
+ errors::InvalidArgument(
+ "scale must be 1-dimensional",
+ scale_tensor.shape().DebugString()));
+ OP_REQUIRES(context, saved_mean_tensor.dims() == 1,
+ errors::InvalidArgument(
+ "saved mean must be 1-dimensional",
+ saved_mean_tensor.shape().DebugString()));
+
+ OP_REQUIRES(context, saved_variance_tensor.dims() == 1,
+ errors::InvalidArgument(
+ "saved variance must be 1-dimensional",
+ saved_variance_tensor.shape().DebugString()));
+
+ if (dnn_shape_src.IsMklTensor())
+ depth_ = dnn_shape_src.DimSize(MklDnnDims::Dim_C);
+ else
+ ExtractParams(context);
+
+ memory::format format_m;
+ if (dnn_shape_src.IsMklTensor()) {
+ if (dnn_shape_src.IsTensorInNCHWFormat())
+ format_m = memory::format::nchw;
+ else
+ format_m = memory::format::nhwc;
+ } else {
+ format_m = TFDataFormatToMklDnnDataFormat(tensor_format_);
+ }
+
+ MklDnnData<T> src(&cpu_engine);
+ MklDnnData<T> mean(&cpu_engine);
+ MklDnnData<T> variance(&cpu_engine);
+ MklDnnData<T> diff_dst(&cpu_engine);
+ MklDnnData<T> diff_src(&cpu_engine);
+
+ memory::dims src_dims, diff_dst_dims;
+ if (dnn_shape_src.IsMklTensor())
+ src_dims = TFShapeToMklDnnDimsInNCHW(
+ dnn_shape_src.GetTfShape(), tensor_format_);
+ else
+ src_dims = TFShapeToMklDnnDimsInNCHW(
+ src_tensor.shape(), tensor_format_);
+
+ if (dnn_shape_diff_dst.IsMklTensor())
+ diff_dst_dims = TFShapeToMklDnnDimsInNCHW(
+ dnn_shape_diff_dst.GetTfShape(),
+ tensor_format_);
+ else
+ diff_dst_dims = TFShapeToMklDnnDimsInNCHW(
+ diff_dst_tensor.shape(),
+ tensor_format_);
+
+ // set src and diff_dst primitives
+ memory::desc src_md({}, memory::data_undef, memory::format_undef);
+ memory::desc diff_dst_md({}, memory::data_undef, memory::format_undef);
+ if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
+ if (dnn_shape_src.IsMklTensor()) {
+ src_md = dnn_shape_src.GetMklLayout();
+ diff_dst_md = src_md;
+ } else {
+ diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
+ src_md = diff_dst_md;
+ }
+ } else {
+ src_md = memory::desc(src_dims, MklDnnType<T>(), format_m);
+ diff_dst_md = src_md;
+ }
+ src.SetUsrMem(src_md, &src_tensor);
+ diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
+
+ // weights -- DNN packs scales/shifts as weights in order of
+ // scale, ..., scale, shift, ..., shift
+ auto weights_desc = memory::desc({2, depth_},
+ MklDnnType<T>(),
+ memory::format::nc);
+ auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine);
+ auto weights_m = memory(weights_pd);
+ T* weights_data = reinterpret_cast<T*>(weights_m.get_data_handle());
+ T* scale_tf = reinterpret_cast<T*>(const_cast<T*>
+ (scale_tensor.flat<T>().data()));
+ for (int k=0; k < depth_; k++) {
+ weights_data[k] = scale_tf[k];
+ weights_data[k + depth_] = 0;
+ }
+
+ // set mean primitive
+ memory::dims mv_dims = GetMeanVarianceDims();
+ mean.SetUsrMem(mv_dims,
+ memory::format::nc,
+ const_cast<void*>(static_cast<const void*>
+ (saved_mean_tensor.flat<T>().data())));
+ mean.SetOpMemDesc(mv_dims, memory::format::nc);
+
+ // set variance primitive
+ variance.SetUsrMem(mv_dims, memory::format::nc,
+ const_cast<void*>(static_cast<const void*>
+ (saved_variance_tensor.flat<T>().data())));
+ variance.SetOpMemDesc(mv_dims, memory::format::nc);
+
+ // set diff_weight primitive
+ auto diff_weights_desc = memory::desc(
+ {2, depth_},
+ MklDnnType<T>(),
+ memory::format::nc);
+ auto diff_weights_pd = memory::primitive_desc(
+ diff_weights_desc,
+ cpu_engine);
+ auto diff_weights_m = memory(diff_weights_pd);
+
+ auto bnrm_fwd_desc = batch_normalization_forward::desc(
+ prop_kind::forward_training,
+ src.GetUsrMemDesc(),
+ epsilon_,
+ use_scale_shift);
+ auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc(
+ bnrm_fwd_desc,
+ cpu_engine);
+
+ // Indices of output tensors
+ const size_t diff_src_index = 0; // index of diff_src tensor
+ const size_t diff_scale_index = 1; // index of diff_scale tensor
+ const size_t diff_shift_index = 2; // index of diff_shift tensor
+ const size_t p1_index = 3; // index of 1st placeholder tensor
+ const size_t p2_index = 4; // index of 2nd placeholder tensor
+
+ // allocate diff_src tensor
+ MklDnnShape dnn_shape_diff_src;
+ TensorShape tf_shape_diff_src;
+ Tensor* diff_src_tensor = nullptr;
+ if (dnn_shape_src.IsMklTensor()) {
+ dnn_shape_diff_src.SetMklTensor(true);
+ auto diff_src_pd = bnrm_fwd_pd.dst_primitive_desc();
+ dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
+ dnn_shape_diff_src.SetElemType(MklDnnType<T>());
+ dnn_shape_diff_src.SetTfLayout(
+ dnn_shape_src.GetDimension(),
+ src_dims,
+ format_m);
+ dnn_shape_diff_src.SetTfDimOrder(
+ dnn_shape_src.GetDimension(),
+ tensor_format_);
+ tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T));
+ } else {
+ dnn_shape_diff_src.SetMklTensor(false);
+ tf_shape_diff_src = src_tensor.shape();
+ }
+ AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+ tf_shape_diff_src, dnn_shape_diff_src);
+
+ diff_src.SetUsrMem(src_md, diff_src_tensor);
+
+ prop_kind pk = prop_kind::backward;
+ auto bnrm_bwd_desc = batch_normalization_backward::desc(
+ pk,
+ diff_src.GetUsrMemDesc(),
+ src.GetUsrMemDesc(),
+ epsilon_,
+ use_scale_shift);
+ auto bnrm_bwd_pd = batch_normalization_backward::primitive_desc(
+ bnrm_bwd_desc,
+ cpu_engine,
+ bnrm_fwd_pd);
+
+ auto bnrm_bwd_op = batch_normalization_backward(
+ bnrm_bwd_pd,
+ src.GetOpMem(),
+ mean.GetOpMem(),
+ variance.GetOpMem(),
+ diff_dst.GetOpMem(),
+ weights_m,
+ diff_src.GetOpMem(),
+ diff_weights_m);
+
+ std::vector<primitive> net;
+ net.push_back(bnrm_bwd_op);
+ stream(stream::kind::eager).submit(net).wait();
+
+ // separate out scale and shift grad and copy to individual tensors
+ const TensorShape& tf_shape_scale_shift = scale_tensor.shape();
+ Tensor* diff_scale_tensor = nullptr;
+ MklDnnShape mkl_shape_diff_scale;
+ mkl_shape_diff_scale.SetMklTensor(false);
+ AllocateOutputSetMklShape(context, diff_scale_index, &diff_scale_tensor,
+ tf_shape_scale_shift, mkl_shape_diff_scale);
+
+ Tensor* diff_shift_tensor = nullptr;
+ MklDnnShape mkl_shape_diff_shift;
+ mkl_shape_diff_shift.SetMklTensor(false);
+ AllocateOutputSetMklShape(context, diff_shift_index, &diff_shift_tensor,
+ tf_shape_scale_shift, mkl_shape_diff_shift);
+
+ // copy data: diff_scale and diff_shift
+ T* diff_weights_data_dnn = reinterpret_cast<T*>
+ (diff_weights_m.get_data_handle());
+ float* diff_scale_data_tf = const_cast<float*>(
+ static_cast<const float*>(diff_scale_tensor->flat<T>().data()));
+ float* diff_shift_data_tf = const_cast<float*>(
+ static_cast<const float*>(diff_shift_tensor->flat<T>().data()));
+ for (int i = 0; i < depth_; i++) {
+ diff_scale_data_tf[i] = diff_weights_data_dnn[i];
+ diff_shift_data_tf[i] = diff_weights_data_dnn[i + depth_];
+ }
+
+ // Placeholders for estimated_mean and estimated_variance, which are
+ // used for inference and thus not needed here for gradient computation.
+ Tensor* p1_tensor = nullptr, *p2_tensor = nullptr;
+ MklDnnShape mkl_shape_p;
+ mkl_shape_p.SetMklTensor(false);
+ AllocateOutputSetMklShape(context, p1_index, &p1_tensor,
+ TensorShape({}), mkl_shape_p);
+ AllocateOutputSetMklShape(context, p2_index, &p2_tensor,
+ TensorShape({}), mkl_shape_p);
+ } catch (mkldnn::error &e) {
+ string error_msg = "Status: " + std::to_string(e.status) +
+ ", message: " + string(e.message) +
+ ", in file " + string(__FILE__) + ":" +
+ std::to_string(__LINE__);
+ OP_REQUIRES_OK(context,
+ errors::Aborted("Operation received an exception:",
+ error_msg));
+ }
+ }
+
+ private:
+ T epsilon_;
+ TensorFormat tensor_format_;
+  int depth_;  // Batch normalization is done per channel.
+
+ void ExtractParams(OpKernelContext* context) {
+ const Tensor& input = MklGetInput(context, 0);
+ depth_ = static_cast<int>(GetTensorDim(input, tensor_format_, 'C'));
+ }
+
+ memory::dims GetMeanVarianceDims() {
+ return memory::dims({1, depth_});
+ }
+};
+
+#endif
+
+#define REGISTER_MKL_CPU(T) \
+ REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNorm") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<T>("T") \
+ .Label(mkl_op_registry::kMklOpLabel), \
+ MklFusedBatchNormOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_MKL_CPU);
+#undef REGISTER_MKL_CPU
#define REGISTER_MKL_CPU(T) \
REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNormGrad") \
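
In the training path of MklFusedBatchNormOp above, the variance copied out to batch_variance is rescaled by adjust_factor because MKL-DNN produces the uncorrected (population) variance while the op's output is expected to carry Bessel's correction. A self-contained sketch of that factor, with n = N*H*W elements per channel:

    #include <cassert>
    #include <cstddef>

    float BesselAdjustFactor(std::size_t batch, std::size_t rows, std::size_t cols) {
      std::size_t orig_size = batch * rows * cols;
      std::size_t adjust_size = orig_size - 1;  // assumes orig_size > 1
      return static_cast<float>(orig_size) / static_cast<float>(adjust_size);
    }

    int main() {
      // Example: N=2, H=3, W=4 gives n=24, so the variance is scaled by 24/23.
      float f = BesselAdjustFactor(2, 3, 4);
      assert(f > 1.043f && f < 1.044f);
      return 0;
    }
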
diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc
index f31e7af..9ee27ee 100644
--- a/tensorflow/core/kernels/mkl_identity_op.cc
+++ b/tensorflow/core/kernels/mkl_identity_op.cc
@@ -28,8 +28,15 @@
#include "mkl_dnn_types.h"
#include "tensorflow/core/util/mkl_util.h"
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+#endif
+
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
+
+#ifndef INTEL_MKL_DNN
+
template <typename Device, typename T>
class MklIdentityOp : public OpKernel {
public:
@@ -50,6 +57,32 @@
bool IsExpensive() override { return false; }
};
+#else
+
+template <typename Device, typename T>
+class MklIdentityOp : public OpKernel {
+ public:
+ explicit MklIdentityOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
+ MklDnnShape dnn_shape_input;
+ const int kInputIdx = 0, kOutputIdx = 0;
+ GetMklShape(context, kInputIdx, &dnn_shape_input);
+
+ if (dnn_shape_input.IsMklTensor()) {
+ ForwardMklTensorInToOut(context, kInputIdx, kOutputIdx);
+ } else {
+ ForwardTfTensorInToOut(context, kInputIdx, kOutputIdx);
+ }
+ }
+
+  // TensorFlow's IdentityOp has the following member function, so we keep
+  // it as is.
+ bool IsExpensive() override { return false; }
+};
+
+#endif
+
#define REGISTER_MKL_CPU(T) \
REGISTER_KERNEL_BUILDER(Name("_MklIdentity") \
.Device(DEVICE_CPU) \
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index b58e44e..001834b 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -31,6 +31,12 @@
#include "tensorflow/core/kernels/mkl_tfconv_op.h"
#include "tensorflow/core/util/mkl_util.h"
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+
+using mkldnn::stream;
+#endif
+
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -44,15 +50,16 @@
// else if both inputs are in mkl format:
// if both have the same shape:
// pass the inputs through to the output
-// else:
-// convert both to TF
+// else:
+// convert both to TF
// else if one is TF and one is MKL:
-// if broadcast is needed:
-// convert the MKL format input to TF format
-// else:
-// convert the TF format input to MKL format
+// if broadcast is needed:
+// convert the MKL format input to TF format
+// else:
+// convert the TF format input to MKL format
///////////////////////////////////////////////////////////
+#ifndef INTEL_MKL_DNN
template <typename Device, typename T>
class MklInputConversionOp : public OpKernel {
public:
@@ -242,6 +249,199 @@
bool has_avx512f_ = false;
};
+#else
+
+template <typename Device, typename T>
+class MklInputConversionOp : public OpKernel {
+ public:
+ explicit MklInputConversionOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
+ OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type));
+ has_avx512f_ = port::TestCPUFeature(port::CPUFeature::AVX512F);
+ }
+
+ private:
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input_tensor_0 = MklGetInput(context, 0);
+ MklDnnShape input_shape_0;
+ GetMklShape(context, 0, &input_shape_0);
+
+ const Tensor& input_tensor_1 = MklGetInput(context, 1);
+ MklDnnShape input_shape_1;
+ GetMklShape(context, 1, &input_shape_1);
+
+ bool tf_shapes_are_same = context->input(0).shape() ==
+ context->input(1).shape();
+
+ VLOG(1) << "MklInputConversionOp: Input shapes are "
+ << (tf_shapes_are_same ? "*same*" : "*different*") << ": "
+ << context->input(0).shape().DebugString() << " and "
+ << context->input(1).shape().DebugString();
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ // if both inputs are in TF format, just copy input tensors to output.
+ if (!input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
+ VLOG(1) << "MklInputConversionOp: No conversion needed, "
+ << "copying TF inputs to output";
+
+ ForwardTfTensorInToOut(context, 0, 0);
+ ForwardTfTensorInToOut(context, 1, 1);
+ return;
+ }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ // If both inputs are in MKL format
+ if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
+ // If both have the same shape, pass them through
+ if (tf_shapes_are_same) {
+ VLOG(1) << "MklInputConversionOp: No conversion needed, "
+ << "copying MKL inputs with identical shapes to output";
+
+ ForwardMklTensorInToOut(context, 0, 0);
+ ForwardMklTensorInToOut(context, 1, 1);
+ return;
+ }
+
+ // Sanity check
+ bool mkl_shapes_are_same = input_shape_0 == input_shape_1;
+ if (mkl_shapes_are_same) {
+ CHECK(false) << "MklInputConversionOp: Unexpected: TF shapes are "
+ "different but MKL shapes are same";
+ }
+
+ // Both have different shapes, so broadcast will be necessary.
+ // Convert to TF and pass both tensors through (we can't do broadcast
+ // with MKL tensors)
+ VLOG(1) << "MklInputConversionOp: Broadcast needed, "
+ << "converted MKL inputs to TF format";
+
+ MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
+ op_data_type, has_avx512f_, 0);
+ MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
+ op_data_type, has_avx512f_, 1);
+ SetDummyMklShapeOutput(context, 0);
+ SetDummyMklShapeOutput(context, 1);
+ return;
+ }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ // One input is MKL and one is TF. If no broadcast is needed, convert
+ // the TF tensor to MKL, otherwise convert the MKL tensor to TF format
+ VLOG(1) << "MklInputConversionOp: Inputs in different formats (MKL/TF)";
+
+ const Tensor* mkl_tensor;
+ const MklDnnShape* mkl_shape;
+ const Tensor* tf_tensor;
+ MklDnnShape* tf_mkl_shape;
+ uint mkl_tensor_index;
+ uint tf_tensor_index;
+ if (input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
+ mkl_tensor = &input_tensor_0;
+ mkl_shape = &input_shape_0;
+ mkl_tensor_index = 0;
+ tf_tensor = &input_tensor_1;
+ tf_mkl_shape = &input_shape_1;
+ tf_tensor_index = 1;
+ } else if (!input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
+ mkl_tensor = &input_tensor_1;
+ mkl_shape = &input_shape_1;
+ mkl_tensor_index = 1;
+ tf_tensor = &input_tensor_0;
+ tf_mkl_shape = &input_shape_0;
+ tf_tensor_index = 0;
+ } else {
+ CHECK(false) << "MklInputConversionOp: Unexpected combination of input "
+ "shapes for MKL "
+ << "element-wise op";
+ }
+
+ // Broadcast is needed if the shapes are not the same
+ bool broadcast_needed;
+
+ size_t in0_size = 1;
+ for (size_t i = 0; i < mkl_shape->GetDimension(); ++i)
+ in0_size *= mkl_shape->TfDimSize(i);
+
+ size_t in1_size = 1;
+ for (size_t i = 0; i < tf_tensor->shape().dims(); ++i)
+ in1_size *= tf_tensor->shape().dim_size(i);
+
+ broadcast_needed = (in0_size != in1_size);
+
+ if (!broadcast_needed) {
+ // Both shapes are same, convert the TF input to MKL
+ VLOG(1) << "MklInputConversionOp: No broadcast needed.";
+ VLOG(1) << "MklInputConversionOp: Converting input " << tf_tensor_index
+ << " to MKL format";
+
+ // Create MklDnnShape for output Mkl tensor.
+ Tensor* tensor_out;
+ MklDnnShape mkl_output_mkl_shape;
+ mkl_output_mkl_shape.SetMklTensor(true);
+ mkl_output_mkl_shape.SetElemType(MklDnnType<T>());
+ mkl_output_mkl_shape.SetTfLayout(mkl_shape->GetDimension(),
+ mkl_shape->GetSizesAsMklDnnDims(),
+ mkl_shape->GetTfDataFormat());
+ // ** Temporarily borrow the layout from the MKL input **
+ auto output_mkl_md = mkl_shape->GetMklLayout();
+ mkl_output_mkl_shape.SetMklLayout(&output_mkl_md);
+
+ // Create output Mkl tensor
+ AllocateOutputSetMklShape(context, tf_tensor_index, &tensor_out,
+ mkl_tensor->shape(), mkl_output_mkl_shape);
+
+ // Create MklDnnData object for input tensor. Input tensor is in
+ // Tensorflow layout.
+ auto cpu_engine = engine(engine::cpu, 0);
+ MklDnnData<T> tf_input(&cpu_engine);
+ auto input_tf_md = mkl_output_mkl_shape.GetTfLayout();
+ tf_input.SetUsrMem(input_tf_md, tf_tensor);
+
+ // Create reorder between tensorflow layout and Mkl layout.
+ std::vector<primitive> net;
+ CHECK_EQ(tf_input.CheckReorderToOpMem(memory::primitive_desc(
+ output_mkl_md, cpu_engine),
+ tensor_out, &net),
+ true);
+ stream(stream::kind::eager).submit(net).wait();
+
+ // -- The tensor in MKL format passes through --
+ ForwardMklTensorInToOut(context, mkl_tensor_index, mkl_tensor_index);
+ } else {
+ // Broadcast is needed, so convert the MKL input to TF
+ VLOG(1) << "MklInputConversionOp: Broadcast needed.";
+ VLOG(1) << "MklInputConversionOp: Converting input " << mkl_tensor_index
+ << " to TF format";
+ MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
+ op_data_type, has_avx512f_,
+ mkl_tensor_index);
+ SetDummyMklShapeOutput(context, mkl_tensor_index);
+
+ // The tensor in TF format passes through
+ ForwardTfTensorInToOut(context, tf_tensor_index, tf_tensor_index);
+ }
+
+ VLOG(1) << "MklInputConversionOp: Shapes (output): "
+ << context->mutable_output(0)->shape().DebugString() << " and "
+ << context->mutable_output(1)->shape().DebugString();
+
+ VLOG(1) << "MklInputConversion completed successfully.";
+ }
+
+ private:
+ /// Data format of the operation
+ string data_format_str;
+
+ /// Data type of the operation
+ DataType op_data_type;
+
+ /// CPUIDInfo
+ bool has_avx512f_ = false;
+};
+
+#endif
+
///////////////////////////////////////////////////////////
// Register kernel
///////////////////////////////////////////////////////////
@@ -253,7 +453,10 @@
.Label(mkl_op_registry::kMklOpLabel), \
MklInputConversionOp<CPUDevice, T>);
-TF_CALL_NUMBER_TYPES(REGISTER_CPU);
+// TODO(nhasabni): We cannot support all number types since MKL-DNN does
+// not support all of them.
+// TF_CALL_NUMBER_TYPES(REGISTER_CPU);
+TF_CALL_float(REGISTER_CPU);
#undef REGISTER_CPU
} // namespace tensorflow
#endif // INTEL_MKL
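The mixed MKL/TF branch of MklInputConversionOp above decides whether broadcasting is required purely by comparing the total element counts of the two inputs. A self-contained sketch of that decision, using a hypothetical BroadcastNeeded helper over plain dimension vectors instead of MklDnnShape/TensorShape:

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-in for the element-count comparison in
// MklInputConversionOp: multiply out each shape's dimensions and treat any
// mismatch in total element count as "broadcast needed".
bool BroadcastNeeded(const std::vector<std::size_t>& mkl_dims,
                     const std::vector<std::size_t>& tf_dims) {
  std::size_t mkl_elems = 1, tf_elems = 1;
  for (std::size_t d : mkl_dims) mkl_elems *= d;
  for (std::size_t d : tf_dims) tf_elems *= d;
  return mkl_elems != tf_elems;
}

int main() {
  // Same element count: the TF-format input would be converted to MKL format.
  std::cout << BroadcastNeeded({8, 3, 32, 32}, {8, 3, 32, 32}) << "\n";  // 0
  // Different element count: the MKL-format input would be converted to TF.
  std::cout << BroadcastNeeded({8, 3, 32, 32}, {1, 3, 1, 1}) << "\n";    // 1
  return 0;
}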
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index aa08e93..227765e 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -17,7 +17,7 @@
// See docs in ../ops/nn_ops.cc. This opkernel uses MKL library, create MKL
// layout and primitives, use MKL dnn primitives to compute local
// response normalization
#ifdef INTEL_MKL
#define EIGEN_USE_THREADS
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index 846bb57..de4d7d2 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -16,17 +16,32 @@
// See docs in ../ops/nn_ops.cc.
#ifdef INTEL_MKL
#define EIGEN_USE_THREADS
-
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/util/mkl_util.h"
#include "tensorflow/core/util/padding.h"
+#ifdef INTEL_MKL_DNN
+#include <algorithm>
+#include "mkldnn.hpp"
+using mkldnn::memory;
+using mkldnn::error;
+using mkldnn::pooling_forward;
+using mkldnn::pooling_backward;
+using mkldnn::padding_kind;
+using mkldnn::engine;
+using mkldnn::prop_kind;
+using mkldnn::algorithm;
+#endif
+
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
+// For now, MKL-ML is the default, so MKL-DNN is not the default choice.
+#ifndef INTEL_MKL_DNN
+
// An implementation of MaxPooling (forward).
template <typename Device, typename T>
class MklMaxPoolingOp : public OpKernel {
@@ -475,8 +490,348 @@
TensorFormat data_format_;
bool workspace_enabled_;
+}; // MklMaxPoolingGradOp
+
+#else // INTEL_MKL_DNN is defined
+
+// An implementation of MaxPooling (forward).
+template <typename Device, typename T>
+class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
+ public:
+ explicit MklMaxPoolingOp(OpKernelConstruction* context)
+ : MklPoolingForwardOpBase<T>(context) {
+ // In Max Pooling, MKLDNN does not allow passing workspace as NULL.
+ // So we set workspace_enabled_ to true.
+ this->workspace_enabled_ = true;
+ }
+
+ void Compute(OpKernelContext* context) override {
+ try {
+ auto cpu_engine = engine(engine::cpu, 0);
+ const Tensor& input_tensor = MklGetInput(context,
+ this->kInputTensorIndexInput);
+ MklDnnShape dnn_shape_input;
+ GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input);
+ this->SanityCheckInput(context, input_tensor, dnn_shape_input);
+ if (!context->status().ok()) return;
+
+ MklDnnData<T> dnn_data_input(&cpu_engine);
+ MklDnnData<T> dnn_data_output(&cpu_engine);
+ MklDnnData<T> dnn_data_wksp(&cpu_engine);
+
+ // initialize variables for the pooling op
+ MklPoolParameters pool_params;
+ // Get the input tensor and initialize the pooling parameters
+ this->ConfigureInput(context, dnn_shape_input,
+ input_tensor, &pool_params,
+ &dnn_data_input);
+ OP_REQUIRES_OK(context, context->status());
+
+ // Declare output tensor
+ Tensor* output_tensor = nullptr;
+ memory::dims output_dims_mkl_order;
+ this->GetOutputDims(pool_params, &output_dims_mkl_order);
+
+ // If input is in Mkl layout, then just get the memory format from it
+ // directly, instead of using input data_format to MaxPool.
+ if (dnn_shape_input.IsMklTensor()) {
+ dnn_data_output.SetUsrMem(output_dims_mkl_order,
+ static_cast<memory::format>(
+ dnn_data_input.GetUsrMemDesc().data.format));
+ } else {
+ dnn_data_output.SetUsrMem(output_dims_mkl_order,
+ this->data_format_mkldnn_);
+ }
+
+ // describe the memory layout; let mkl-dnn choose the best for the op
+ dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
+
+ auto pool_desc = pooling_forward::desc(prop_kind::forward,
+ algorithm::pooling_max,
+ dnn_data_input.GetUsrMemDesc(),
+ dnn_data_output.GetUsrMemDesc(),
+ memory::dims({ pool_params.row_stride,
+ pool_params.col_stride}),
+ memory::dims({ pool_params.window_rows,
+ pool_params.window_cols}),
+ memory::dims({ static_cast<int>(pool_params.pad_top),
+ static_cast<int>(pool_params.pad_left)}),
+ memory::dims({ static_cast<int>(pool_params.pad_bottom),
+ static_cast<int>(pool_params.pad_right)}),
+ TFPaddingToMklDnnPadding(this->padding_));
+ auto pool_fwd_desc = pooling_forward::primitive_desc(pool_desc,
+ cpu_engine);
+
+ this->AllocateOutputTensor(context, pool_fwd_desc, output_dims_mkl_order,
+ this->data_format_mkldnn_, &output_tensor);
+ OP_REQUIRES_OK(context, context->status());
+ dnn_data_output.SetUsrMemDataHandle(output_tensor);
+
+ AllocateWorkspaceTensor(context, pool_fwd_desc, &dnn_data_wksp);
+ OP_REQUIRES_OK(context, context->status());
+
+ this->PrepareAndExecuteNet(pool_fwd_desc, &dnn_data_input,
+ &dnn_data_output, &dnn_data_wksp);
+ } catch (mkldnn::error &e) {
+ string error_msg = "Status: " + std::to_string(e.status) +
+ ", message: " + string(e.message) +
+ ", in file " + string(__FILE__) + ":" +
+ std::to_string(__LINE__);
+ OP_REQUIRES_OK(context,
+ errors::Aborted("Compute received an exception:",
+ error_msg));
+ }
+ } // Compute
+
+ private:
+ const int kOutputTensorIndexWorkspace = 1;
+
+ void AllocateWorkspaceTensor(OpKernelContext* context,
+ const pooling_forward::primitive_desc& pool_fwd_prim_desc,
+ MklDnnData<T>* dnn_data_wksp) {
+ CHECK_NOTNULL(dnn_data_wksp);
+ Tensor* workspace_tensor = nullptr;
+ memory::primitive_desc workspace_pd
+ = pool_fwd_prim_desc.workspace_primitive_desc();
+ size_t workspace_t_elems = this->GetNumTElements(workspace_pd);
+ MklDnnShape workspace_mkl_shape;
+ workspace_mkl_shape.SetMklTensor(false);
+ TensorShape workspace_tf_shape;
+ workspace_tf_shape.AddDim(workspace_t_elems);
+ AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace,
+ &workspace_tensor,
+ workspace_tf_shape, workspace_mkl_shape);
+ CHECK_NOTNULL(workspace_tensor);
+ dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
+ }
};
+// The operation to compute MaxPool gradients.
+// It takes three inputs:
+// - The original input tensor
+// - The original output tensor
+// - Backprop tensor for output
+// It produces one output: backprop tensor for input.
+template <class Device, class T>
+class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
+ public:
+ explicit MklMaxPoolingGradOp(OpKernelConstruction* context)
+ : MklPoolingBackwardOpBase<T>(context) {
+ }
+
+ void Compute(OpKernelContext* context) override {
+ try {
+ auto cpu_engine = engine(engine::cpu, 0);
+ const Tensor& orig_input_tensor = MklGetInput(context,
+ kInputTensorIndexOrigInput);
+ const Tensor& orig_output_tensor = MklGetInput(context,
+ kInputTensorIndexOrigOutput);
+ const Tensor& grad_tensor = MklGetInput(context,
+ kInputTensorIndexGradient);
+ const Tensor& workspace_tensor = MklGetInput(context,
+ kInputTensorIndexWorkspace);
+ MklDnnShape orig_input_mkl_shape,
+ orig_output_mkl_shape,
+ grad_mkl_shape,
+ workspace_mkl_shape;
+ GetMklShape(context, kInputTensorIndexOrigInput,
+ &orig_input_mkl_shape);
+ GetMklShape(context, kInputTensorIndexOrigOutput,
+ &orig_output_mkl_shape);
+ GetMklShape(context, kInputTensorIndexGradient,
+ &grad_mkl_shape);
+ GetMklShape(context, kInputTensorIndexWorkspace,
+ &workspace_mkl_shape);
+
+ SanityCheckInputs(context,
+ orig_input_tensor, orig_output_tensor,
+ grad_tensor, workspace_tensor,
+ orig_input_mkl_shape, orig_output_mkl_shape,
+ grad_mkl_shape, workspace_mkl_shape);
+ if (!context->status().ok()) return;
+
+ MklDnnData<T> grad_dnn_data(&cpu_engine);
+ MklDnnData<T> workspace_dnn_data(&cpu_engine);
+ MklDnnData<T> output_dnn_data(&cpu_engine);
+ Tensor* output_tensor = nullptr;
+ MklPoolParameters pool_params;
+ TensorShape orig_input_shape;
+ memory::dims output_dims_mkl_order, orig_input_dims_mkl_order;
+ memory::desc original_input_md = ConfigureOriginalInput(context,
+ orig_input_tensor,
+ orig_input_mkl_shape,
+ &orig_input_dims_mkl_order,
+ &pool_params,
+ &orig_input_shape);
+
+ memory::desc original_output_md = this->ConfigureOriginalOutput(
+ pool_params,
+ orig_output_mkl_shape,
+ output_dims_mkl_order);
+
+ memory::desc target_diff_dst_md = this->ConfigureInputGradient(
+ grad_mkl_shape,
+ grad_tensor,
+ &grad_dnn_data,
+ original_output_md);
+
+ output_dnn_data.SetUsrMem(original_input_md);
+
+ // Create the forward pooling primitive descriptor so we can
+ // pass it as a hint to the backward pooling primitive descriptor
+ auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward,
+ algorithm::pooling_max,
+ original_input_md,
+ original_output_md,
+ memory::dims({ pool_params.row_stride,
+ pool_params.col_stride}),
+ memory::dims({ pool_params.window_rows,
+ pool_params.window_cols}),
+ memory::dims({ static_cast<int>(pool_params.pad_top),
+ static_cast<int>(pool_params.pad_left)}),
+ memory::dims({ static_cast<int>(pool_params.pad_bottom),
+ static_cast<int>(pool_params.pad_right)}),
+ TFPaddingToMklDnnPadding(this->padding_));
+ auto pool_fwd_prim_desc
+ = pooling_forward::primitive_desc(pool_fwd_desc,
+ cpu_engine);
+
+ auto pool_bkwd_desc = pooling_backward::desc(
+ algorithm::pooling_max,
+ output_dnn_data.GetUsrMemDesc(),
+ target_diff_dst_md,
+ memory::dims({ pool_params.row_stride,
+ pool_params.col_stride}),
+ memory::dims({ pool_params.window_rows,
+ pool_params.window_cols}),
+ memory::dims({ static_cast<int>(pool_params.pad_top),
+ static_cast<int>(pool_params.pad_left)}),
+ memory::dims({ static_cast<int>(pool_params.pad_bottom),
+ static_cast<int>(pool_params.pad_right)}),
+ TFPaddingToMklDnnPadding(this->padding_));
+ auto pool_bkwd_prim_desc
+ = pooling_backward::primitive_desc(pool_bkwd_desc,
+ cpu_engine,
+ pool_fwd_prim_desc);
+
+ this->AllocateOutputTensor(context, pool_bkwd_prim_desc,
+ orig_input_dims_mkl_order,
+ this->data_format_mkldnn_,
+ &output_tensor);
+ output_dnn_data.SetUsrMemDataHandle(output_tensor);
+
+ ConfigureWorkspace(workspace_tensor,
+ pool_fwd_prim_desc.workspace_primitive_desc(),
+ &workspace_dnn_data);
+ this->PrepareAndExecuteNet(pool_bkwd_prim_desc,
+ &grad_dnn_data,
+ &output_dnn_data,
+ memory::primitive_desc(
+ target_diff_dst_md,
+ cpu_engine),
+ &workspace_dnn_data);
+ } catch (mkldnn::error &e) {
+ string error_msg = "Status: " + std::to_string(e.status) +
+ ", message: " + string(e.message) +
+ ", in file " + string(__FILE__) + ":" +
+ std::to_string(__LINE__);
+ OP_REQUIRES_OK(context,
+ errors::Aborted("Compute received an exception:",
+ error_msg));
+ }
+ } // Compute
+
+ private:
+ // .Input("orig_input: T")
+ // .Input("orig_output: T")
+ // .Input("grad: T")
+ // .Input("workspace: T")
+ const int kInputTensorIndexOrigInput = 0;
+ const int kInputTensorIndexOrigOutput = 1;
+ const int kInputTensorIndexGradient = 2;
+ const int kInputTensorIndexWorkspace = 3;
+ // Output("output: T") in Base Class
+
+ memory::desc ConfigureOriginalInput(OpKernelContext* context,
+ const Tensor& tensor_original_input,
+ const MklDnnShape& original_input_mkl_shape,
+ memory::dims* original_input_dims_mkl_order,
+ MklPoolParameters* pool_params,
+ TensorShape* input_tensor_shape) {
+ *input_tensor_shape = tensor_original_input.shape();
+ return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
+ context,
+ tensor_original_input,
+ original_input_mkl_shape,
+ original_input_dims_mkl_order,
+ pool_params,
+ *input_tensor_shape);
+ }
+
+ void ConfigureWorkspace(const Tensor& workspace_tensor,
+ memory::primitive_desc workspace_pd,
+ MklDnnData<T> *workspace_dnn_data) {
+ CHECK_NOTNULL(workspace_dnn_data);
+
+ workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor);
+ }
+
+ void SanityCheckInputs(OpKernelContext* context,
+ const Tensor& orig_input_tensor,
+ const Tensor& orig_output_tensor,
+ const Tensor& grad_tensor,
+ const Tensor& workspace_tensor,
+ const MklDnnShape& orig_input_mkl_shape,
+ const MklDnnShape& orig_output_mkl_shape,
+ const MklDnnShape& grad_mkl_shape,
+ const MklDnnShape& workspace_mkl_shape) {
+ if (!orig_input_mkl_shape.IsMklTensor()) {
+ OP_REQUIRES(context, orig_input_tensor.dims() == 4,
+ errors::InvalidArgument("Original input shape must be "
+ "4-dimensional"));
+ } else {
+ OP_REQUIRES(context, orig_input_mkl_shape.GetDimension() == 4,
+ errors::InvalidArgument("Original input shape must be "
+ "4-dimensional"));
+ }
+ if (!orig_output_mkl_shape.IsMklTensor()) {
+ OP_REQUIRES(context, orig_output_tensor.dims() == 4,
+ errors::InvalidArgument("Original output must be "
+ "4-dimensional"));
+ } else {
+ OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4,
+ errors::InvalidArgument("Original output must be "
+ "4-dimensional"));
+ }
+ if (!grad_mkl_shape.IsMklTensor()) {
+ OP_REQUIRES(context, grad_tensor.dims() == 4,
+ errors::InvalidArgument("Gradient must be 4-dimensional"));
+ } else {
+ OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4,
+ errors::InvalidArgument("Gradient must be "
+ "4-dimensional"));
+ }
+ if (this->workspace_enabled_) {
+ // The workspace should not be an MKL tensor
+ OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false,
+ errors::InvalidArgument("Workspace tensor should not"
+ " be an MKL Tensor."));
+ // It should only have one dimension
+ OP_REQUIRES(context, workspace_tensor.dims() == 1,
+ errors::InvalidArgument("Workspace tensor must be "
+ "1-dimensional"));
+ } else {
+ OP_REQUIRES(context, this->workspace_enabled_,
+ errors::Unimplemented("MKL-DNN Max Pooling does not "
+ "yet support the use case "
+ "where MaxPoolGrad is called without first"
+ " calling MaxPool."));
+ }
+ }
+}; // MklMaxPoolingGradOp
+
+#endif // INTEL_MKL_DNN
+
REGISTER_KERNEL_BUILDER(Name("_MklMaxPool")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T")
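MklMaxPoolingOp above always allocates a workspace tensor because MKL-DNN max pooling records, for each output element, which input element won the max; MklMaxPoolingGradOp then routes gradients through that record. A naive 1-D reference sketch of the idea (purely illustrative; it does not use the MKL-DNN workspace layout):

#include <cstddef>
#include <iostream>
#include <vector>

// Naive 1-D max pooling that also records, per output element, the index of
// the winning input element. This "workspace" is what makes the backward
// pass cheap: gradients are routed only to the recorded argmax positions.
void MaxPool1D(const std::vector<float>& in, int window, int stride,
               std::vector<float>* out, std::vector<int>* workspace) {
  for (std::size_t start = 0; start + window <= in.size(); start += stride) {
    int argmax = static_cast<int>(start);
    for (std::size_t i = start + 1; i < start + window; ++i)
      if (in[i] > in[argmax]) argmax = static_cast<int>(i);
    out->push_back(in[argmax]);
    workspace->push_back(argmax);
  }
}

// Backward pass: scatter each output gradient to the recorded argmax index.
std::vector<float> MaxPool1DGrad(std::size_t input_size,
                                 const std::vector<float>& grad_out,
                                 const std::vector<int>& workspace) {
  std::vector<float> grad_in(input_size, 0.0f);
  for (std::size_t o = 0; o < grad_out.size(); ++o)
    grad_in[workspace[o]] += grad_out[o];
  return grad_in;
}

int main() {
  std::vector<float> in = {1, 5, 2, 4, 3, 0};
  std::vector<float> out;
  std::vector<int> ws;
  MaxPool1D(in, /*window=*/2, /*stride=*/2, &out, &ws);      // out = {5, 4, 3}
  std::vector<float> gin = MaxPool1DGrad(in.size(), {1, 1, 1}, ws);
  for (float g : gin) std::cout << g << " ";                 // 0 1 0 1 1 0
  std::cout << "\n";
  return 0;
}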
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index 65e8852..f7cadff 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -14,10 +14,13 @@
==============================================================================*/
#ifdef INTEL_MKL
+
#include <vector>
+#include <limits>
#include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/kernels/bounds_check.h"
namespace tensorflow {
@@ -39,6 +42,7 @@
Init(context, ksize, stride, padding, data_format);
}
+#ifndef INTEL_MKL_DNN
// Initialization for MKL format
void MklPoolParameters::Init(OpKernelContext* context,
const std::vector<int32>& ksize,
@@ -53,7 +57,22 @@
Init(context, ksize, stride, padding, data_format);
}
+#else
+// Initialization for MKL format
+void MklPoolParameters::Init(OpKernelContext* context,
+ const std::vector<int32>& ksize,
+ const std::vector<int32>& stride, Padding padding,
+ TensorFormat data_format,
+ const MklDnnShape* mklInputShape) {
+ // Get the input sizes
+ depth = mklInputShape->GetDimension('C');
+ tensor_in_cols = mklInputShape->GetDimension('W');
+ tensor_in_rows = mklInputShape->GetDimension('H');
+ tensor_in_batch = mklInputShape->GetDimension('N');
+ Init(context, ksize, stride, padding, data_format);
+}
+#endif // INTEL_MKL_DNN
// Common Initialization for TensorFlow and MKL formats
void MklPoolParameters::Init(OpKernelContext* context,
const std::vector<int32>& ksize,
@@ -80,7 +99,7 @@
"MaxPooling supports exactly one of pooling across depth "
"or pooling across width/height."));
- if (depth_window == 1) {
+ if (depth_window == 1) { // we are pooling across the H and W dimensions
OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
tensor_in_rows, window_rows, row_stride,
padding, &out_height, &pad_top, &pad_bottom));
@@ -88,7 +107,21 @@
OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
tensor_in_cols, window_cols, col_stride,
padding, &out_width, &pad_left, &pad_right));
- } else {
+#ifdef INTEL_MKL_DNN
+ // TF can work with int64, but mkldnn only supports int32.
+ // Fail if the height or width is larger than the int32 maximum.
+
+ OP_REQUIRES(context, FastBoundsCheck(out_height,
+ std::numeric_limits<int>::max()),
+ errors::InvalidArgument("output height is too large"));
+
+ OP_REQUIRES(context, FastBoundsCheck(out_width,
+ std::numeric_limits<int>::max()),
+ errors::InvalidArgument("output width is too large"));
+
+#endif
+ out_depth = depth; // output will have the same depth as the input
+ } else { // we are pooling in the depth dimension
// Our current version of depthwise max pooling does not support
// any padding, and expects the depth_window to equal the depth
// stride (no overlapping).
@@ -109,7 +142,6 @@
errors::Unimplemented("Depthwise max pooling is currently "
"only implemented for CPU devices."));
- pad_depth = 0;
out_depth = depth / depth_window;
}
}
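The INTEL_MKL_DNN branch above rejects pooling parameters whose computed output height or width would not fit in an int32, since mkldnn dims are 32-bit. A self-contained sketch of that guard, with a simple WithinBounds stand-in for FastBoundsCheck (the real check lives in bounds_check.h):

#include <cstdint>
#include <iostream>
#include <limits>

// Stand-in for FastBoundsCheck: true iff 0 <= value < limit.
bool WithinBounds(std::int64_t value, std::int64_t limit) {
  return value >= 0 && value < limit;
}

int main() {
  // TensorFlow computes output sizes as int64, but mkldnn dims are int32,
  // so the pooling parameters are rejected if they would overflow.
  const std::int64_t out_height = 1LL << 33;  // deliberately too large
  const std::int64_t out_width = 112;
  const std::int64_t int_max = std::numeric_limits<int>::max();

  std::cout << "height ok: " << WithinBounds(out_height, int_max) << "\n";  // 0
  std::cout << "width  ok: " << WithinBounds(out_width, int_max) << "\n";   // 1
  return 0;
}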
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index 92ea2be..d33e91a 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -18,9 +18,18 @@
#ifdef INTEL_MKL
#include <vector>
+#include <string>
#include "tensorflow/core/util/mkl_util.h"
#include "tensorflow/core/util/padding.h"
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+using mkldnn::memory;
+using mkldnn::pooling_forward;
+using mkldnn::pooling_backward;
+using mkldnn::stream;
+#endif
+
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -51,14 +60,28 @@
int pad_depth;
TensorFormat data_format;
+ MklPoolParameters()
+ : depth(0)
+ , tensor_in_cols(0), tensor_in_rows(0), tensor_in_batch(0)
+ , window_rows(0), window_cols(0), depth_window(0)
+ , row_stride(0), col_stride(0), depth_stride(0)
+ , out_height(0), out_width(0), out_depth(0)
+ , pad_left(0), pad_right(0), pad_top(0), pad_bottom(0), pad_depth(0)
+ , data_format(TensorFormat::FORMAT_NCHW) {}
// Updates context->status if there is an invalid input.
void Init(OpKernelContext* context, const std::vector<int32>& ksize,
const std::vector<int32>& stride, Padding padding,
TensorFormat data_format, const TensorShape& tensor_in_shape);
+#ifndef INTEL_MKL_DNN
void Init(OpKernelContext* context, const std::vector<int32>& ksize,
const std::vector<int32>& stride, Padding padding,
TensorFormat data_format, const MklShape* mkl_in_shape);
+#else
+ void Init(OpKernelContext* context, const std::vector<int32>& ksize,
+ const std::vector<int32>& stride, Padding padding,
+ TensorFormat data_format, const MklDnnShape* mkl_in_shape);
+#endif
private:
// Common initialization for TensorFlow and MKL formats
@@ -67,6 +90,325 @@
TensorFormat data_format);
};
+#ifdef INTEL_MKL_DNN
+
+template <class T>
+class MklPoolingOpBase : public OpKernel {
+ public:
+ explicit MklPoolingOpBase(OpKernelConstruction* context)
+ : OpKernel(context)
+ , workspace_enabled_(false) {
+ string data_format;
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+ OP_REQUIRES(context,
+ FormatFromString(data_format, &this->data_format_tf_),
+ errors::InvalidArgument("Invalid data format"));
+ this->data_format_mkldnn_
+ = TFDataFormatToMklDnnDataFormat(this->data_format_tf_);
+ OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_));
+ OP_REQUIRES(context, this->ksize_.size() == 4,
+ errors::InvalidArgument("Sliding window ksize field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_));
+ OP_REQUIRES(context, this->stride_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_));
+ OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1,
+ errors::Unimplemented("Pooling is not yet supported on the "
+ "batch dimension."));
+
+ // We may not get this attribute for this node if it does not go through
+ // the graph rewrite pass, so we do not check for errors when retrieving
+ // this attribute value.
+ context->GetAttr("workspace_enabled", &this->workspace_enabled_);
+ }
+ void Compute(OpKernelContext* context) override = 0;
+
+ protected:
+ // Calculates the output shape of the pooling op in MKL-DNN (NCHW) order;
+ // the TensorFlow output itself will be in NHWC or NCHW depending on the
+ // data format. Expects the output height and width to have already been
+ // int32 bounds-checked.
+ void GetOutputDims(const MklPoolParameters& mkl_pool_params,
+ memory::dims* output_dims_mkl_order) {
+ // MKL-DNN always needs output in NCHW format.
+ *output_dims_mkl_order = { mkl_pool_params.tensor_in_batch,
+ mkl_pool_params.out_depth,
+ static_cast<int>(mkl_pool_params.out_height),
+ static_cast<int>(mkl_pool_params.out_width)};
+ }
+
+ void InitMklPoolParameters(OpKernelContext* context,
+ MklPoolParameters* pool_params,
+ const MklDnnShape& original_input_mkl_shape,
+ const TensorShape& input_tensor_shape) {
+ if (!original_input_mkl_shape.IsMklTensor()) {
+ pool_params->Init(context, this->ksize_, this->stride_, this->padding_,
+ this->data_format_tf_, input_tensor_shape);
+ } else {
+ pool_params->Init(context, this->ksize_, this->stride_, this->padding_,
+ this->data_format_tf_, &original_input_mkl_shape);
+ }
+ }
+
+ // Returns the number of T elements needed to hold pd's byte size,
+ // i.e. num_bytes / sizeof(T) rounded up.
+ size_t GetNumTElements(const memory::primitive_desc& pd) {
+ size_t num_bytes = pd.get_size();
+ size_t ret_val = num_bytes / sizeof(T);
+ if (num_bytes % sizeof(T) != 0) {
+ ret_val++;
+ }
+ return ret_val;
+ }
+
+
+ std::vector<int32> ksize_;
+ std::vector<int32> stride_;
+ Padding padding_;
+ TensorFormat data_format_tf_;
+ memory::format data_format_mkldnn_;
+ bool workspace_enabled_;
+};
+
+template <class T>
+class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
+ public:
+ explicit MklPoolingForwardOpBase(OpKernelConstruction* context)
+ : MklPoolingOpBase<T>(context) {}
+ void Compute(OpKernelContext* context) override = 0;
+
+ protected:
+ void ConfigureInput(OpKernelContext* context,
+ const MklDnnShape& input_mkl_shape,
+ const Tensor& input_tensor,
+ MklPoolParameters* pool_params,
+ MklDnnData<T>* dnn_data_input) {
+ CHECK_NOTNULL(pool_params);
+ CHECK_NOTNULL(dnn_data_input);
+ TensorShape input_tensor_shape = input_tensor.shape();
+ memory::desc input_md = input_mkl_shape.IsMklTensor()
+ ? input_mkl_shape.GetMklLayout()
+ : memory::desc(
+ TFShapeToMklDnnDimsInNCHW(
+ input_tensor_shape, this->data_format_tf_),
+ MklDnnType<T>(),
+ this->data_format_mkldnn_);
+ dnn_data_input->SetUsrMem(input_md, &input_tensor);
+ this->InitMklPoolParameters(context, pool_params,
+ input_mkl_shape, input_tensor_shape);
+ }
+
+ void AllocateOutputTensor(OpKernelContext* context,
+ const pooling_forward::primitive_desc& pool_fwd_prim_desc,
+ const memory::dims output_dims_mkl_order,
+ const memory::format& output_tf_format,
+ Tensor** output_tensor) {
+ CHECK_NOTNULL(output_tensor);
+ memory::primitive_desc dst_pd = pool_fwd_prim_desc.dst_primitive_desc();
+
+ MklDnnShape output_mkl_shape;
+ output_mkl_shape.SetMklTensor(true);
+ output_mkl_shape.SetMklLayout(&dst_pd);
+ output_mkl_shape.SetElemType(MklDnnType<T>());
+ output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+ output_dims_mkl_order,
+ output_tf_format);
+ TensorShape output_tf_shape;
+
+ // only allocate enough space for the elements we need.
+ output_tf_shape.AddDim(this->GetNumTElements(dst_pd));
+ AllocateOutputSetMklShape(context, kOutputTensorIndexOutput,
+ output_tensor,
+ output_tf_shape, output_mkl_shape);
+ CHECK_NOTNULL(*output_tensor);
+ }
+
+ void PrepareAndExecuteNet(
+ const pooling_forward::primitive_desc& pool_fwd_desc,
+ const MklDnnData<T>* src,
+ MklDnnData<T>* dst,
+ MklDnnData<T>* wksp = nullptr) {
+ std::vector<primitive> net;
+
+ // Create pooling primitive and add it to net
+ if (wksp != nullptr) {
+ net.push_back(pooling_forward(pool_fwd_desc,
+ src->GetOpMem(),
+ dst->GetOpMem(),
+ wksp->GetOpMem()));
+ } else {
+ net.push_back(pooling_forward(pool_fwd_desc,
+ src->GetOpMem(),
+ dst->GetOpMem()));
+ }
+ stream(stream::kind::eager).submit(net).wait();
+ }
+
+
+ void SanityCheckInput(OpKernelContext* context,
+ const Tensor& input_tensor,
+ const MklDnnShape& input_mkl_shape) {
+ if (!input_mkl_shape.IsMklTensor()) {
+ OP_REQUIRES(context, input_tensor.dims() == 4,
+ errors::InvalidArgument("Input must be 4-dimensional"));
+ } else {
+ OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4,
+ errors::InvalidArgument("Input shape must be "
+ "4-dimensional"));
+ }
+ }
+ // .Input("value: T")
+ // .Output("output: T")
+ const int kInputTensorIndexInput = 0;
+ const int kOutputTensorIndexOutput = 0;
+}; // MklPoolingForwardBaseOp
+
+
+template <class T>
+class MklPoolingBackwardOpBase : public MklPoolingOpBase<T> {
+ public:
+ explicit MklPoolingBackwardOpBase(OpKernelConstruction* context)
+ : MklPoolingOpBase<T>(context) { }
+ void Compute(OpKernelContext* context) override = 0;
+
+ protected:
+ const int kOutputTensorIndexOutput = 0;
+
+ void AllocateOutputTensor(OpKernelContext* context,
+ const pooling_backward::primitive_desc& pool_bkwd_prim_desc,
+ const memory::dims output_dims_mkl_order,
+ const memory::format& output_tf_format,
+ Tensor** output_tensor) {
+ CHECK_NOTNULL(output_tensor);
+ memory::primitive_desc dst_pd
+ = pool_bkwd_prim_desc.diff_src_primitive_desc();
+ MklDnnShape output_mkl_shape;
+ output_mkl_shape.SetMklTensor(true);
+ output_mkl_shape.SetMklLayout(&dst_pd);
+ output_mkl_shape.SetElemType(MklDnnType<T>());
+ output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+ output_dims_mkl_order,
+ output_tf_format);
+
+ TensorShape output_tf_shape;
+ output_tf_shape.AddDim(this->GetNumTElements(dst_pd));
+ AllocateOutputSetMklShape(context, kOutputTensorIndexOutput,
+ output_tensor,
+ output_tf_shape, output_mkl_shape);
+ CHECK_NOTNULL(*output_tensor);
+ }
+
+ void PrepareAndExecuteNet(
+ const pooling_backward::primitive_desc& pool_bkwd_desc,
+ MklDnnData<T>* input_gradient_diff_dst,
+ MklDnnData<T>* output_diff_src,
+ const memory::primitive_desc& target_diff_dst_pd,
+ const MklDnnData<T>* workspace = nullptr) {
+
+ std::vector<primitive> net;
+
+ // If the input gradient isn't in the same format as the output,
+ // reorder it to the output's format.
+ input_gradient_diff_dst->CheckReorderToOpMem(
+ target_diff_dst_pd,
+ &net);
+
+ // Create pooling primitive and add it to net
+ if (nullptr == workspace) {
+ net.push_back(pooling_backward(pool_bkwd_desc,
+ input_gradient_diff_dst->GetOpMem(),
+ output_diff_src->GetOpMem()));
+ } else {
+ net.push_back(pooling_backward(pool_bkwd_desc,
+ input_gradient_diff_dst->GetOpMem(),
+ workspace->GetOpMem(),
+ output_diff_src->GetOpMem()));
+ }
+ stream(stream::kind::eager).submit(net).wait();
+ }
+
+ // Max pooling and average pooling have slightly different implementations.
+ // Takes the Tensor containing the original input data and the original
+ // MklDnnShape, and populates the remaining output arguments.
+ memory::desc ConfigureOriginalInput(OpKernelContext* context,
+ const Tensor& tensor_original_input_shape,
+ const MklDnnShape& original_input_mkl_shape,
+ memory::dims* original_input_dims_nchw,
+ MklPoolParameters* pool_params,
+ const TensorShape& input_tensor_shape) {
+ CHECK_NOTNULL(original_input_dims_nchw);
+ CHECK_NOTNULL(pool_params);
+ this->InitMklPoolParameters(context, pool_params,
+ original_input_mkl_shape,
+ input_tensor_shape);
+
+ *original_input_dims_nchw
+ = original_input_mkl_shape.IsMklTensor()
+ ? original_input_mkl_shape.GetSizesAsMklDnnDims()
+ : TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
+ this->data_format_tf_);
+
+ return original_input_mkl_shape.IsMklTensor()
+ ? original_input_mkl_shape.GetMklLayout()
+ : memory::desc(*original_input_dims_nchw,
+ MklDnnType<T>(),
+ this->data_format_mkldnn_);
+ }
+
+ memory::desc ConfigureOriginalOutput(const MklPoolParameters& pool_params,
+ const MklDnnShape& original_output_mkl_shape,
+ memory::dims output_dims_mkl_order) {
+ this->GetOutputDims(pool_params, &output_dims_mkl_order);
+
+ return original_output_mkl_shape.IsMklTensor()
+ ? original_output_mkl_shape.GetMklLayout()
+ : memory::desc(output_dims_mkl_order,
+ MklDnnType<T>(),
+ this->data_format_mkldnn_);
+ }
+
+ memory::desc ConfigureInputGradient(
+ const MklDnnShape& input_gradient_mkl_shape,
+ const Tensor& input_gradient_tensor,
+ MklDnnData<T>* input_gradient_dnn_data,
+ const memory::desc& original_output_md) {
+ // Configure the gradient as is
+ memory::desc original_input_grad_md
+ = input_gradient_mkl_shape.IsMklTensor()
+ ? input_gradient_mkl_shape.GetMklLayout()
+ : memory::desc(TFShapeToMklDnnDimsInNCHW(
+ input_gradient_tensor.shape(),
+ this->data_format_tf_),
+ MklDnnType<T>(), this->data_format_mkldnn_);
+
+ input_gradient_dnn_data->SetUsrMem(original_input_grad_md,
+ &input_gradient_tensor);
+
+ // Check whether the input gradient (diff_dst) is already in the same
+ // format as the original output; if not, build a memory descriptor with
+ // the gradient's shape but the format of the other tensors.
+ memory::format original_output_format =
+ static_cast<memory::format>(original_output_md.data.format);
+ bool grad_reorder_needed = input_gradient_dnn_data->IsReorderNeeded(
+ original_output_format);
+ memory::dims diff_dst_dims = input_gradient_mkl_shape.IsMklTensor()
+ ? input_gradient_mkl_shape.GetSizesAsMklDnnDims()
+ : TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(),
+ this->data_format_tf_);
+ memory::desc target_diff_dst_md = memory::desc(diff_dst_dims,
+ MklDnnType<T>(), original_output_format);
+
+ return grad_reorder_needed
+ ? target_diff_dst_md
+ : original_input_grad_md;
+ }
+};
+#endif // INTEL_MKL_DNN
+
//-------------------------------------------------------------------
// Utility functions
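GetNumTElements above converts a primitive descriptor's byte size into a count of T elements, rounding up when the byte count is not an exact multiple of sizeof(T), so the allocated tensor is never too small. The same arithmetic as a self-contained sketch:

#include <cstddef>
#include <iostream>

// Number of elements of Elem needed to hold num_bytes bytes, rounding up
// when num_bytes is not an exact multiple of sizeof(Elem). This mirrors the
// GetNumTElements() arithmetic used when sizing workspace/output tensors.
template <typename Elem>
std::size_t NumElementsForBytes(std::size_t num_bytes) {
  std::size_t n = num_bytes / sizeof(Elem);
  if (num_bytes % sizeof(Elem) != 0) ++n;
  return n;
}

int main() {
  std::cout << NumElementsForBytes<float>(16) << "\n";  // 4
  std::cout << NumElementsForBytes<float>(18) << "\n";  // 5 (rounded up)
  return 0;
}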
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 86a77d7..45bdd0a 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -28,6 +28,19 @@
#include "mkl_dnn.h"
#include "mkl_dnn_types.h"
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+
+using mkldnn::stream;
+using mkldnn::prop_kind;
+using mkldnn::algorithm;
+using mkldnn::relu_forward;
+using mkldnn::relu_backward;
+using mkldnn::eltwise_relu;
+using mkldnn::eltwise_elu;
+using mkldnn::eltwise_tanh;
+#endif
+
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -45,6 +58,8 @@
}
};
+#ifndef INTEL_MKL_DNN
+
template <typename Device, typename T>
class MklReluOp : public OpKernel {
public:
@@ -59,6 +74,7 @@
GetMklShape(context, 0, &mkl_context.input_shape);
void* user_i = static_cast<void*>(const_cast<T*>(input.flat<T>().data()));
bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
+
if (!input_in_mkl_format && !input.dims()) { // handle the case of a scalar
const TensorShape& o_shape = input.shape();
Tensor* out_tensor = nullptr;
@@ -164,6 +180,7 @@
} MklReluOpContext;
};
+
template <typename Device, typename T>
class MklReluGradOp : public OpKernel {
public:
@@ -189,18 +206,18 @@
const Tensor& a = MklGetInput(context, 1);
void* buf_input = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
void* mkl_buffer_convert = nullptr;
+
dnnPrimitive_t cv_input_to_grad = nullptr;
- // if input and grad are not in the same layout, do a conversion between
- // them.
+ // if input and grad are not in the same layout,
+ // do a conversion between them.
if (!dnnLayoutCompare_F32(lt_input, lt_grad)) {
AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_grad,
&mkl_buffer_convert);
CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input,
lt_grad), E_SUCCESS);
CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, buf_input,
- mkl_buffer_convert),
- E_SUCCESS);
+ mkl_buffer_convert), E_SUCCESS);
relu_res[dnnResourceSrc] = mkl_buffer_convert;
dnnDelete_F32(cv_input_to_grad);
} else {
@@ -246,7 +263,6 @@
};
template <typename Device, typename T>
-
void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
MklReluGradOpContext mkl_context;
const Tensor& g = MklGetInput(context, 0);
@@ -264,20 +280,21 @@
!MklReluHelpers::ValidateSameSize(context, g, a))
return;
Tensor* output = nullptr;
- if (!input_is_mkl && !grad_is_mkl &&
- !a.dims()) { // handle the case of a scalar
- // Allocate space for g and
+
+ if (!input_is_mkl && !grad_is_mkl && !a.dims()) {
+ // handle the scalar case
const TensorShape& g_shape = g.shape();
mkl_context.output_shape.SetMklTensor(false);
AllocateOutputSetMklShape(context, 0, &output, g_shape,
mkl_context.output_shape);
+
void* out_o = static_cast<void*>(output->flat<T>().data());
(static_cast<T*>(out_o))[0] =
(static_cast<T*>(user_g))[0] * ((static_cast<T*>(user_i))[0] > 0);
return;
}
- // Generate size, stride for input if input/grad is in MKL format.
+ // generate size, stride for input if input/grad is in mkl format.
if (grad_is_mkl || input_is_mkl) {
const MklShape* tmp_mkl_shape =
(grad_is_mkl) ? &mkl_context.grad_shape : &mkl_context.input_shape;
@@ -308,21 +325,20 @@
float negative_slope = 0.0;
CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL,
mkl_context.lt_grad, mkl_context.lt_grad,
- negative_slope),
- E_SUCCESS);
+ negative_slope), E_SUCCESS);
Tensor mkl_tmp_input_buf_tensor;
mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_input_buf_tensor);
if (input_is_mkl ||
- grad_is_mkl) { /*if grad or input are MKL leave it in MKL*/
+ grad_is_mkl) { /*if grad or input are mkl leave it in mkl*/
TensorShape tf_shape;
mkl_context.output_shape.SetMklTensor(true);
mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_bwd,
dnnResourceDiffSrc);
mkl_context.output_shape.SetTfLayout(
mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
- // If input_is_mkl or grad_is_mkl, then we copy strides and sizes from Mkl
- // shape of one that is in MKL layout.
+ // if input_is_mkl or grad_is_mkl, then we copy strides and sizes from mkl
+ // shape of one that is in mkl layout.
if (grad_is_mkl == true) {
mkl_context.output_shape.SetTfDimOrder(
mkl_context.in_dims, mkl_context.grad_shape.GetTfToMklDimMap());
@@ -332,11 +348,9 @@
}
tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
- mkl_context.output_shape.GetMklLayout())) /
- sizeof(T));
+ mkl_context.output_shape.GetMklLayout())) / sizeof(T));
AllocateOutputSetMklShape(context, 0, &output, tf_shape,
mkl_context.output_shape);
-
} else {
const TensorShape& o_shape = g.shape();
mkl_context.output_shape.SetMklTensor(false);
@@ -347,13 +361,430 @@
mkl_context.relu_res[dnnResourceDiffSrc] =
static_cast<void*>(output->flat<T>().data());
- CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, mkl_context.relu_res),
- E_SUCCESS);
+ CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd,
+ mkl_context.relu_res),
+ E_SUCCESS);
mkl_context.MklCleanup();
}
-/* Register DNN kernels for supported operations and supported types - right now
- * it is only Relu and f32*/
+
+#else // INTEL_MKL_DNN
+
+template <typename Device, typename T, algorithm alg_kind>
+class MklReluOpBase : public OpKernel {
+ public:
+ ~MklReluOpBase() {}
+
+ explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) {
+ }
+
+ virtual void Compute_Scalar(OpKernelContext* context) = 0;
+
+ void Compute(OpKernelContext* context) override {
+ try {
+ auto cpu_engine = engine(engine::cpu, 0);
+ const size_t src_index = 0; // index of src input tensor
+ const size_t dst_index = 0; // index of dst output tensor
+ const Tensor& src_tensor = MklGetInput(context, src_index);
+ MklDnnShape dnn_shape_src;
+ GetMklShape(context, src_index, &dnn_shape_src);
+
+ Tensor* dst_tensor = nullptr;
+ if (src_tensor.dims() == 0) {
+ Compute_Scalar(context);
+ return;
+ }
+
+ // Create relu primitive.
+ MklDnnData<T> src(&cpu_engine);
+ MklDnnData<T> dst(&cpu_engine);
+
+ // Set DNN primitive - src
+ memory::desc src_md({}, memory::data_undef, memory::format_undef);
+ if (dnn_shape_src.IsMklTensor()) {
+ src_md = dnn_shape_src.GetMklLayout();
+ } else {
+ auto src_dims = TFShapeToMklDnnDims(src_tensor.shape());
+ auto src_strides = CalculateTFStrides(src_dims);
+ // Create blocked memory descriptor
+ src_md = MklDnnData<T>::CreateBlockedMemDesc(src_dims, src_strides);
+ }
+ src.SetUsrMem(src_md, &src_tensor);
+
+ T alpha = 0, beta = 0;
+ std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
+ // The operator memory descriptor is the same as the user memory descriptor.
+ auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training,
+ alg_kind, src.GetUsrMemDesc(),
+ alpha, beta);
+ relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc,
+ cpu_engine));
+
+ // allocate dst tensor
+ MklDnnShape dnn_shape_dst;
+ TensorShape tf_shape_dst;
+ if (dnn_shape_src.IsMklTensor()) {
+ dnn_shape_dst.SetMklTensor(true);
+ auto dst_pd = relu_fwd_pd->dst_primitive_desc();
+ dnn_shape_dst.SetMklLayout(&dst_pd);
+ dnn_shape_dst.SetElemType(MklDnnType<T>());
+ dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(),
+ dnn_shape_src.GetSizesAsMklDnnDims(),
+ dnn_shape_src.GetTfDataFormat());
+ tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T));
+ } else {
+ dnn_shape_dst.SetMklTensor(false);
+ tf_shape_dst = src_tensor.shape();
+ }
+ AllocateOutputSetMklShape(context, dst_index, &dst_tensor, tf_shape_dst,
+ dnn_shape_dst);
+
+ // Destination memory descriptor is same as source memory descriptor.
+ auto dst_md = src_md;
+ dst.SetUsrMem(dst_md, dst_tensor);
+
+ // execute net
+ std::vector<primitive> net;
+ auto relu_fwd = relu_forward(*relu_fwd_pd, src.GetOpMem(),
+ dst.GetOpMem());
+ net.push_back(relu_fwd);
+ stream(stream::kind::eager).submit(net).wait();
+ } catch (mkldnn::error &e) {
+ string error_msg = "Status: " + std::to_string(e.status) +
+ ", message: " + string(e.message) +
+ ", in file " + string(__FILE__) + ":" +
+ std::to_string(__LINE__);
+ OP_REQUIRES_OK(context,
+ errors::Aborted("Operation received an exception:",
+ error_msg));
+ }
+ }
+};
+
+
+template <typename Device, typename T, algorithm alg_kind>
+class MklReluGradOpBase : public OpKernel {
+ public:
+ ~MklReluGradOpBase() {}
+
+ explicit MklReluGradOpBase(OpKernelConstruction* context) :
+ OpKernel(context) {}
+
+ virtual void Compute_Scalar(OpKernelContext* context) = 0;
+
+ void Compute(OpKernelContext* context) override {
+ try {
+ auto cpu_engine = engine(engine::cpu, 0);
+ MklDnnData<T> src(&cpu_engine);
+ MklDnnData<T> diff_dst(&cpu_engine);
+ MklDnnData<T> diff_src(&cpu_engine);
+
+ const size_t diff_dst_index = 0; // index of diff_dst input tensor
+ const size_t src_index = 1; // index of src input tensor
+ const size_t diff_src_index = 0; // index of diff_src output tensor
+
+ const Tensor& src_tensor = MklGetInput(context, src_index);
+ const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+ Tensor* diff_src_tensor = nullptr;
+
+ MklDnnShape dnn_shape_src, dnn_shape_diff_dst;
+ GetMklShape(context, src_index, &dnn_shape_src);
+ GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+ int src_dims_size = src_tensor.dims();
+ if (src_dims_size == 0) {
+ Compute_Scalar(context);
+ return;
+ }
+
+ // Set DNN primitives for src & diff_dst
+ memory::desc src_md({}, memory::data_undef, memory::format_undef);
+ memory::desc diff_dst_md({}, memory::data_undef, memory::format_undef);
+ if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
+ if (dnn_shape_diff_dst.IsMklTensor()) {
+ diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
+ src_md = diff_dst_md;
+ } else {
+ src_md = dnn_shape_src.GetMklLayout();
+ diff_dst_md = src_md;
+ }
+ } else {
+ auto src_dims = TFShapeToMklDnnDims(src_tensor.shape());
+ auto src_strides = CalculateTFStrides(src_dims);
+ src_md = MklDnnData<T>::CreateBlockedMemDesc(src_dims, src_strides);
+ diff_dst_md = src_md;
+ }
+ src.SetUsrMem(src_md, &src_tensor);
+ diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
+
+ T alpha = 0, beta = 0;
+ std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
+ auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training,
+ alg_kind, src_md, alpha, beta);
+ relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc,
+ cpu_engine));
+ auto relu_bwd_desc = relu_backward::desc(alg_kind, diff_dst_md, src_md,
+ alpha, beta);
+ auto relu_bwd_pd = relu_backward::primitive_desc(relu_bwd_desc,
+ cpu_engine, *relu_fwd_pd);
+
+ // allocate diff_src tensor
+ MklDnnShape dnn_shape_diff_src;
+ TensorShape tf_shape_diff_src;
+ if (dnn_shape_src.IsMklTensor()) {
+ dnn_shape_diff_src.SetMklTensor(true);
+ auto diff_src_pd = relu_bwd_pd.diff_src_primitive_desc();
+ dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
+ dnn_shape_diff_src.SetElemType(MklDnnType<T>());
+ dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(),
+ dnn_shape_src.GetSizesAsMklDnnDims(),
+ dnn_shape_src.GetTfDataFormat());
+ tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T));
+ } else {
+ dnn_shape_diff_src.SetMklTensor(false);
+ tf_shape_diff_src = src_tensor.shape();
+ }
+ AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+ tf_shape_diff_src, dnn_shape_diff_src);
+
+ // diff_src memory descriptor is same as diff_dst memory descriptor.
+ auto diff_src_md = diff_dst_md;
+ diff_src.SetUsrMem(diff_src_md, diff_src_tensor);
+
+ PrepareAndExecuteNet(relu_bwd_pd, &src, &diff_src, &diff_dst);
+ } catch (mkldnn::error &e) {
+ string error_msg = "Status: " + std::to_string(e.status) +
+ ", message: " + string(e.message) +
+ ", in file " + string(__FILE__) + ":" +
+ std::to_string(__LINE__);
+ OP_REQUIRES_OK(context,
+ errors::Aborted("Operation received an exception:",
+ error_msg));
+ }
+ }
+
+ void PrepareAndExecuteNet(const relu_backward::primitive_desc& relu_prim_desc,
+ MklDnnData<T>* src, MklDnnData<T>* diff_src, MklDnnData<T>*
+ diff_dst) {
+ std::vector<primitive> net;
+ net.push_back(relu_backward(relu_prim_desc, src->GetOpMem(),
+ diff_dst->GetOpMem(), diff_src->GetOpMem()));
+ stream(stream::kind::eager).submit(net).wait();
+ }
+};
+
+
+template <typename Device, typename T>
+class MklReluOp : public MklReluOpBase<Device, T, eltwise_relu> {
+ public:
+ ~MklReluOp() {}
+
+ explicit MklReluOp(OpKernelConstruction* context) :
+ MklReluOpBase<Device, T, eltwise_relu>(context) {}
+
+ virtual void Compute_Scalar(OpKernelContext* context) {
+ const size_t src_index = 0; // index of src input tensor
+ const size_t dst_index = 0; // index of dst output tensor
+ const Tensor& src_tensor = MklGetInput(context, src_index);
+ MklDnnShape dnn_shape_src;
+ GetMklShape(context, src_index, &dnn_shape_src);
+
+ Tensor* dst_tensor = nullptr;
+ void* user_i = static_cast<void*>(const_cast<T*>(
+ src_tensor.flat<T>().data()));
+ MklDnnShape dnn_shape_dst;
+ dnn_shape_dst.SetMklTensor(false);
+ AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+ src_tensor.shape(), dnn_shape_dst);
+ void* out_o = static_cast<void*>(dst_tensor->flat<T>().data());
+ (static_cast<T*>(out_o))[0] =
+ std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
+ return;
+ }
+};
+
+template <typename Device, typename T>
+class MklReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
+ public:
+ ~MklReluGradOp() {}
+
+ explicit MklReluGradOp(OpKernelConstruction* context) :
+ MklReluGradOpBase<Device, T, eltwise_relu>(context) {}
+
+ virtual void Compute_Scalar(OpKernelContext* context) {
+ const size_t diff_dst_index = 0; // index of diff_dst input tensor
+ const size_t src_index = 1; // index of src input tensor
+ const size_t diff_src_index = 0; // index of diff_src output tensor
+ const Tensor& src_tensor = MklGetInput(context, src_index);
+ const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+ Tensor* diff_src_tensor = nullptr;
+
+ MklDnnShape dnn_shape_diff_dst;
+ GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+ int src_dims_size = src_tensor.dims();
+ MklDnnShape dnn_shape_diff_src;
+ dnn_shape_diff_src.SetMklTensor(false);
+ AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+ diff_dst_tensor.shape(), dnn_shape_diff_src);
+ void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
+ void* user_i =
+ static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+ void* user_g =
+ static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+ (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0] *
+ ((static_cast<T*>(user_i))[0] > 0);
+ return;
+ }
+};
+
+template <typename Device, typename T>
+class MklEluOp : public MklReluOpBase<Device, T, eltwise_elu> {
+ public:
+ ~MklEluOp() {}
+
+ explicit MklEluOp(OpKernelConstruction* context) :
+ MklReluOpBase<Device, T, eltwise_elu>(context) {}
+
+ virtual void Compute_Scalar(OpKernelContext* context) {
+ const size_t src_index = 0; // index of src input tensor
+ const size_t dst_index = 0; // index of dst output tensor
+ const Tensor& src_tensor = MklGetInput(context, src_index);
+ MklDnnShape dnn_shape_src;
+ GetMklShape(context, src_index, &dnn_shape_src);
+
+ Tensor* dst_tensor = nullptr;
+ void* user_i = static_cast<void*>(const_cast<T*>(
+ src_tensor.flat<T>().data()));
+ MklDnnShape dnn_shape_dst;
+ dnn_shape_dst.SetMklTensor(false);
+ AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+ src_tensor.shape(), dnn_shape_dst);
+ void* out_o = static_cast<void*>(dst_tensor->flat<T>().data());
+ // elu(x) = exp(x) - 1 if x < 0; x otherwise
+ T feature = (static_cast<T*>(user_i))[0];
+ if (feature < 0)
+ (static_cast<T*>(out_o))[0] = std::exp(feature) - 1;
+ else
+ (static_cast<T*>(out_o))[0] = feature;
+ return;
+ }
+};
+
+template <typename Device, typename T>
+class MklEluGradOp : public MklReluGradOpBase<Device, T, eltwise_elu> {
+ public:
+ ~MklEluGradOp() {}
+
+ explicit MklEluGradOp(OpKernelConstruction* context) :
+ MklReluGradOpBase<Device, T, eltwise_elu>(context) {}
+
+ virtual void Compute_Scalar(OpKernelContext* context) {
+ const size_t diff_dst_index = 0; // index of diff_dst input tensor
+ const size_t src_index = 1; // index of src input tensor
+ const size_t diff_src_index = 0; // index of diff_src output tensor
+ const Tensor& src_tensor = MklGetInput(context, src_index);
+ const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+ Tensor* diff_src_tensor = nullptr;
+
+ MklDnnShape dnn_shape_diff_dst;
+ GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+ int src_dims_size = src_tensor.dims();
+ MklDnnShape dnn_shape_diff_src;
+ dnn_shape_diff_src.SetMklTensor(false);
+ AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+ diff_dst_tensor.shape(), dnn_shape_diff_src);
+ void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
+ void* user_i =
+ static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+ void* user_g =
+ static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+ // gradient of elu(x) = 1 if x > 0; elu(x) + 1 otherwise
+ T feature = (static_cast<T*>(user_i))[0];
+ if (feature > 0) {
+ (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0];
+ } else {
+ T elu = std::exp(feature) - 1;
+ (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0] * (elu + 1);
+ }
+ }
+};
+
+template <typename Device, typename T>
+class MklTanhOp : public MklReluOpBase<Device, T, eltwise_tanh> {
+ public:
+ ~MklTanhOp() {}
+
+ explicit MklTanhOp(OpKernelConstruction* context) :
+ MklReluOpBase<Device, T, eltwise_tanh>(context) {}
+
+ virtual void Compute_Scalar(OpKernelContext* context) {
+ const size_t src_index = 0; // index of src input tensor
+ const size_t dst_index = 0; // index of dst output tensor
+ const Tensor& src_tensor = MklGetInput(context, src_index);
+ MklDnnShape dnn_shape_src;
+ GetMklShape(context, src_index, &dnn_shape_src);
+
+ Tensor* dst_tensor = nullptr;
+ void* user_i = static_cast<void*>(const_cast<T*>(
+ src_tensor.flat<T>().data()));
+ MklDnnShape dnn_shape_dst;
+ dnn_shape_dst.SetMklTensor(false);
+ AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+ src_tensor.shape(), dnn_shape_dst);
+ void* out_o = static_cast<void*>(dst_tensor->flat<T>().data());
+ // tanh(x) = (e^x - e^(-x)) / (e^x + e^(-x))
+ T feature = (static_cast<T*>(user_i))[0];
+ T e1 = std::exp(feature);
+ T e2 = std::exp(-feature);
+ (static_cast<T*>(out_o))[0] = (e1 - e2)/(e1 + e2);
+ return;
+ }
+};
+
+template <typename Device, typename T>
+class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
+ public:
+ ~MklTanhGradOp() {}
+
+ explicit MklTanhGradOp(OpKernelConstruction* context) :
+ MklReluGradOpBase<Device, T, eltwise_tanh>(context) {}
+
+ virtual void Compute_Scalar(OpKernelContext* context) {
+ const size_t diff_dst_index = 0; // index of diff_dst input tensor
+ const size_t src_index = 1; // index of src input tensor
+ const size_t diff_src_index = 0; // index of diff_src output tensor
+ const Tensor& src_tensor = MklGetInput(context, src_index);
+ const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+ Tensor* diff_src_tensor = nullptr;
+
+ MklDnnShape dnn_shape_diff_dst;
+ GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+ int src_dims_size = src_tensor.dims();
+ MklDnnShape dnn_shape_diff_src;
+ dnn_shape_diff_src.SetMklTensor(false);
+ AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+ diff_dst_tensor.shape(), dnn_shape_diff_src);
+ void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
+ void* user_i =
+ static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+ // gradient of tanh(x) = 1 - tanh(x)^2
+ T feature = (static_cast<T*>(user_i))[0];
+ T e1 = std::exp(feature);
+ T e2 = std::exp(-feature);
+ T tanh = (e1 - e2)/(e1 + e2);
+ void* user_g =
+ static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+ (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0] *
+ (1 - tanh * tanh);
+ }
+};
+
+#endif
+
+// register dnn kernels for supported operations and supported types
#define REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES(type) \
REGISTER_KERNEL_BUILDER(Name("_MklRelu") \
.Device(DEVICE_CPU) \
@@ -367,6 +798,38 @@
MklReluGradOp<CPUDevice, type>);
TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
+#ifdef INTEL_MKL_DNN
+
+// register dnn kernels for supported operations and supported types
+#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type) \
+ REGISTER_KERNEL_BUILDER(Name("_MklElu") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .Label(mkl_op_registry::kMklOpLabel), \
+ MklEluOp<CPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER(Name("_MklEluGrad") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .Label(mkl_op_registry::kMklOpLabel), \
+ MklEluGradOp<CPUDevice, type>);
+TF_CALL_float(REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES);
+
+#define REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES(type) \
+ REGISTER_KERNEL_BUILDER(Name("_MklTanh") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .Label(mkl_op_registry::kMklOpLabel), \
+ MklTanhOp<CPUDevice, type>); \
+ REGISTER_KERNEL_BUILDER(Name("_MklTanhGrad") \
+ .Device(DEVICE_CPU) \
+ .TypeConstraint<type>("T") \
+ .Label(mkl_op_registry::kMklOpLabel), \
+ MklTanhGradOp<CPUDevice, type>);
+TF_CALL_float(REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES);
+
+#endif
+
} // namespace tensorflow
#endif // INTEL_MKL
+
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index 5e98582..11c92eb 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -28,6 +28,11 @@
#include "mkl_dnn_types.h"
#include "tensorflow/core/util/mkl_util.h"
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+using mkldnn::stream;
+#endif
+
namespace tensorflow {
using CPUDevice = Eigen::ThreadPoolDevice;
template <typename Device, typename T>
@@ -35,6 +40,7 @@
public:
explicit MklReshapeOp(OpKernelConstruction* context) : OpKernel(context) {}
+#ifndef INTEL_MKL_DNN
void Compute(OpKernelContext* context) override {
const Tensor& input = MklGetInput(context, 0);
const Tensor& sizes = MklGetInput(context, 1);
@@ -129,7 +135,183 @@
}
}
+#else
+
private:
+  // When the input tensor is in MKL layout and we are reshaping it to a
+  // shape different from its actual shape, we use the MKLDNN reorder
+  // primitive to put the tensor back in Tensorflow layout. But this
+  // reordering can sometimes be skipped. This function checks for all such
+  // cases.
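+  // For example (illustrative): if MKL-DNN already keeps the tensor in the
+  // same NHWC/NCHW order that Tensorflow expects, the buffer is contiguous in
+  // Tensorflow order and can be forwarded without a reorder.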
+ bool SkipReorder(const MklDnnShape& mkl_shape_input,
+ const TensorShape& reshape_to) {
+ CHECK_EQ(mkl_shape_input.IsMklTensor(), true);
+ bool ret = false;
+
+ // If Tensorflow's data format and the underlying format maintained by
+ // MKLDNN are equivalent (both are NHWC or both are NCHW), then we can
+ // safely return true.
+ auto input_mkl_md = mkl_shape_input.GetMklLayout();
+ if (mkl_shape_input.GetTfDataFormat() == input_mkl_md.data.format) {
+ ret = true;
+ }
+
+ return ret;
+ }
+
+ public:
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input_tensor = MklGetInput(context, 0);
+ const Tensor& sizes = MklGetInput(context, 1);
+
+ MklDnnShape mkl_shape_input;
+ GetMklShape(context, kInputSlotIdx, &mkl_shape_input);
+ bool input_in_mkl_format = mkl_shape_input.IsMklTensor();
+ const int64 nelems = input_in_mkl_format ?
+ mkl_shape_input.GetTfShape().num_elements()
+ : input_tensor.NumElements();
+
+ // Preliminary validation of sizes.
+ OP_REQUIRES(context, IsLegacyVector(sizes.shape()),
+ errors::InvalidArgument("sizes input must be 1-D, not shape ",
+ sizes.shape().DebugString()));
+
+ // Compute the output shape. Determine product of specified
+ // dimensions, and find the index of the unspecified one.
+ TensorShape shape;
+ int64 product = 1;
+ int unknown_index = -1;
+ switch (sizes.dtype()) {
+ case DT_INT32:
+ OP_REQUIRES_OK(context, ValidateSizes<int32>(sizes, &product,
+ &unknown_index, &shape));
+ break;
+ case DT_INT64:
+ OP_REQUIRES_OK(context, ValidateSizes<int64>(sizes, &product,
+ &unknown_index, &shape));
+ break;
+ default:
+ context->CtxFailure(errors::InvalidArgument(
+ "desired shape must be a DT_INT32 or DT_INT64 vector, not a ",
+ DataTypeString(sizes.dtype())));
+ return;
+ }
+ if (unknown_index != -1) {
+ OP_REQUIRES(
+ context, product > 0,
+ errors::InvalidArgument("Reshape cannot infer the missing input size "
+ "for an empty tensor unless all specified "
+ "input sizes are non-zero"));
+ const int64 missing = nelems / product;
+ OP_REQUIRES(
+ context, product * missing == nelems,
+ errors::InvalidArgument(
+ "Input to reshape is a tensor with ", nelems,
+ " values, but the requested shape requires a multiple of ",
+ product));
+ shape.set_dim(unknown_index, missing);
+ }
+ OP_REQUIRES(context, shape.num_elements() == nelems,
+ errors::InvalidArgument("Input to reshape is a tensor with ",
+ nelems,
+ " values, but the requested shape has ",
+ shape.num_elements()));
+
+ if (input_in_mkl_format) {
+ TensorShape& shape_to = shape;
+ TensorShape shape_from = mkl_shape_input.GetTfShape();
+ if (shape_from == shape_to) {
+ CopyMklTensorInToOut(context, kInputSlotIdx, kOutputSlotIdx);
+ return;
+ } else {
+ try {
+ auto cpu_engine = engine(engine::cpu, 0);
+ MklDnnData<T> dnn_data_input(&cpu_engine);
+          // Reshape is just a logical view change operation for a tensor.
+          // It does not change the underlying layout. But MKLDNN may keep
+          // tensor data in a layout different from the one specified by
+          // Tensorflow. If MKLDNN maintains the input tensor in a layout
+          // different from the one Tensorflow expects, we need to reorder
+          // the tensor and then put it in the shape expected by Tensorflow.
+          // But if MKLDNN already maintains the input tensor in the layout
+          // Tensorflow expects, we don't need to reorder the tensor
+          // contents; we just update the MklDnnShape object associated with
+          // the input tensor to reflect the shape change expected by reshape.
+ if (!SkipReorder(mkl_shape_input, shape_to)) {
+ // If dimensions that are being expanded or collapsed are not
+ // maintained contiguously by MKLDNN, then we use reorder.
+
+ // Get Mkl layout of input tensor.
+ auto input_mkl_md = mkl_shape_input.GetMklLayout();
+ // Set input Mkl layout as the user layout.
+ dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor);
+ // Get expected Tensorflow layout of input tensor.
+ auto output_tf_md = mkl_shape_input.GetTfLayout();
+ auto output_tf_pd = memory::primitive_desc(output_tf_md,
+ cpu_engine);
+
+ Tensor* output_tensor = nullptr;
+ MklShape mkl_shape_output;
+ mkl_shape_output.SetMklTensor(false);
+ // We allocate output tensor in the shape expected by Reshape.
+ AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor,
+ shape_to, mkl_shape_output);
+
+ // Insert reorder between Mkl layout and TensorFlow layout.
+ std::vector<primitive> net;
+ CHECK_EQ(dnn_data_input.CheckReorderToOpMem(output_tf_pd,
+ output_tensor, &net), true);
+ stream(stream::kind::eager).submit(net).wait();
+ return;
+ } else {
+ // If dimensions that are being expanded or collapsed are
+ // maintained contiguously by MKLDNN, then we skip reorder, just
+ // update MklDnnShape object for the tensorflow tensor, and forward
+ // Tensorflow tensor as it is to the output.
+ auto output_dims = TFShapeToMklDnnDims(shape_to);
+ auto output_strides = CalculateTFStrides(output_dims);
+ auto output_tf_md = MklDnnData<T>::CreateBlockedMemDesc(output_dims,
+ output_strides);
+ auto output_tf_pd = memory::primitive_desc(output_tf_md,
+ cpu_engine);
+
+ // Set MklDnnShape
+ MklDnnShape mkl_shape_output;
+ mkl_shape_output.SetMklTensor(true);
+ mkl_shape_output.SetMklLayout(&output_tf_pd);
+ mkl_shape_output.SetElemType(MklDnnType<T>());
+ mkl_shape_output.SetTfLayout(output_dims.size(), output_dims,
+ memory::format::blocked);
+
+ // We now simply forward input Mkl tensor to output and change its
+ // output MklDnnShape object.
+ ForwardMklTensorInToOutWithMklShape(context, kInputSlotIdx,
+ kOutputSlotIdx, mkl_shape_output);
+ return;
+ }
+ } catch (mkldnn::error &e) {
+ string error_msg = "Status: " + std::to_string(e.status) +
+ ", message: " + string(e.message) +
+ ", in file " + string(__FILE__) + ":" +
+ std::to_string(__LINE__);
+ OP_REQUIRES_OK(context,
+ errors::Aborted("Operation received an exception:",
+ error_msg));
+ }
+ }
+ } else {
+ // If input tensor is not in Mkl format, then just copy Tensorflow tensor
+ // to output with specified shape.
+ CopyTfTensorInToOutWithShape(context, kInputSlotIdx, kOutputSlotIdx,
+ shape);
+ }
+ }
+
+#endif // INTEL_MKL_DNN
+
+ private:
+ const int kInputSlotIdx = 0;
+ const int kOutputSlotIdx = 0;
+
template <typename Tshape>
Status ValidateSizes(const Tensor& sizes, int64* product, int* unknown_index,
TensorShape* shape) {
diff --git a/tensorflow/core/kernels/quantized_conv_ops.cc b/tensorflow/core/kernels/quantized_conv_ops.cc
index f83998e..1921b83 100644
--- a/tensorflow/core/kernels/quantized_conv_ops.cc
+++ b/tensorflow/core/kernels/quantized_conv_ops.cc
@@ -268,6 +268,13 @@
Im2ColBufferResource<T1, chunk_value_count>* im2col_buffer_resource;
std::function<Status(Im2ColBufferResource<T1, chunk_value_count>**)>
creator = [](Im2ColBufferResource<T1, chunk_value_count>** resource) {
+#ifdef _MSC_VER
+      // MSVC complains about capturing chunk_value_count here, although the
+      // same pattern compiles fine in, for example, conv_ops_using_gemm.cc.
+      // Define chunk_value_count inside the lambda for now.
+ const int64 chunk_value_count =
+ (kMaxChunkSize + (sizeof(T1) - 1)) / sizeof(T1);
+#endif
*resource = new Im2ColBufferResource<T1, chunk_value_count>();
return Status::OK();
};
diff --git a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc
index be1fa22..3c31016 100644
--- a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc
+++ b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc
@@ -161,7 +161,7 @@
}
// Write length of compressed block to output buffer.
- char* compressed_length_array = new char[4];
+ char compressed_length_array[4];
std::fill(compressed_length_array, compressed_length_array + 4, 0);
for (int i = 0; i < 4; i++) {
// Little endian.
@@ -173,7 +173,6 @@
TF_RETURN_IF_ERROR(AddToOutputBuffer(output.data(), output.size()));
next_in_ += avail_in_;
avail_in_ = 0;
- delete[] compressed_length_array;
return Status::OK();
}
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 980d0c3..15122af 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -2958,6 +2958,25 @@
expected to invoke these operators.
)doc");
+REGISTER_OP("__MklDummyConv2DWithBias")
+ .Input("input: T")
+ .Input("filter: T")
+ .Input("bias: T")
+ .Output("output: T")
+ .Attr("T: {half, float, double}")
+ .Attr("strides: list(int)")
+ .Attr("use_cudnn_on_gpu: bool = true")
+ .Attr(GetPaddingAttrString())
+ .Attr(GetConvnetDataFormatAttrString())
+ .Doc(R"doc(
+Dummy node that enables fusing the Conv2D and BiasAdd operators for MKL. This
+node performs no computation. It is just created as an intermediate output of
+merging Conv2D and BiasAdd.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
REGISTER_OP("_MklConv2DWithBias")
.Input("input: T")
.Input("filter: T")
@@ -3011,6 +3030,88 @@
expected to invoke these operators.
)doc");
+REGISTER_OP("__MklDummyConv2DBackpropFilterWithBias")
+ .Input("input: T")
+ .Input("filter_sizes: int32")
+ .Input("out_backprop: T")
+ .Output("output: T")
+ .Output("bias_grad: T")
+ .Attr("T: {half, float, double}")
+ .Attr("strides: list(int)")
+ .Attr("use_cudnn_on_gpu: bool = true")
+ .Attr(GetPaddingAttrString())
+ .Attr(GetConvnetDataFormatAttrString())
+ .SetShapeFn([](InferenceContext* c) {
+ ShapeHandle input_shape;
+ // Fetch the data_format attribute, which may not exist.
+ string data_format;
+ Status s = c->GetAttr("data_format", &data_format);
+
+ if (s.ok() && data_format == "NCHW") {
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+ c->set_output(1, c->Vector(c->Dim(input_shape, -3)));
+ } else {
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+ c->set_output(1, c->Vector(c->Dim(input_shape, -1)));
+ }
+ ShapeHandle sh;
+ TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &sh));
+ TF_RETURN_IF_ERROR(c->WithRank(sh, 4, &sh));
+ c->set_output(0, sh);
+ return Status::OK();
+ })
+ .Doc(R"doc(
+Dummy node that enables fusing the Conv2DBackpropFilter and BiasAddGrad
+operators for MKL. This node performs no computation. It is just created as an
+intermediate output of merging Conv2DBackpropFilter and BiasAddGrad.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklConv2DBackpropFilterWithBias")
+ .Input("input: T")
+ .Input("filter_sizes: int32")
+ .Input("out_backprop: T")
+ .Input("mkl_input: uint8")
+ .Input("mkl_filter_size: uint8")
+ .Input("mkl_out_backprop: uint8")
+ .Output("output: T")
+ .Output("bias_grad: T")
+ .Output("mkl_output: uint8")
+ .Output("mkl_bias_grad: uint8")
+ .Attr("T: {half, float, double}")
+ .Attr("strides: list(int)")
+ .Attr("use_cudnn_on_gpu: bool = true")
+ .Attr(GetPaddingAttrString())
+ .Attr(GetConvnetDataFormatAttrString())
+ .SetShapeFn([](InferenceContext* c) {
+ ShapeHandle input_shape;
+ // Fetch the data_format attribute, which may not exist.
+ string data_format;
+ Status s = c->GetAttr("data_format", &data_format);
+
+ if (s.ok() && data_format == "NCHW") {
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+ c->set_output(1, c->Vector(c->Dim(input_shape, -3)));
+ } else {
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+ c->set_output(1, c->Vector(c->Dim(input_shape, -1)));
+ }
+ ShapeHandle sh;
+ TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &sh));
+ TF_RETURN_IF_ERROR(c->WithRank(sh, 4, &sh));
+ c->set_output(0, sh);
+ return Status::OK();
+ })
+ .Doc(R"doc(
+MKL version of Conv2DBackpropFilterWithBias. Uses MKL DNN APIs to compute the
+gradients of convolution with respect to the filter.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
REGISTER_OP("_MklConv2DWithBiasBackpropBias")
.Input("out_backprop: T")
.Input("mkl_out_backprop: uint8")
@@ -3087,6 +3188,78 @@
expected to invoke these operators.
)doc");
+REGISTER_OP("_MklElu")
+ .Input("features: T")
+ .Input("mkl_features: uint8")
+ .Output("activations: T")
+ .Output("mkl_activations: uint8")
+ .Attr("T: realnumbertype")
+ .SetShapeFn(shape_inference::UnchangedShape)
+ .Doc(R"doc(
+MKL version of Elu operator. Uses MKL DNN APIs to implement Elu operator.
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklEluGrad")
+ .Input("gradients: T")
+ .Input("features: T")
+ .Input("mkl_gradients: uint8")
+ .Input("mkl_features: uint8")
+ .Output("backprops: T")
+ .Output("mkl_backprops: uint8")
+ .Attr("T: realnumbertype")
+ .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
+ .Doc(R"doc(
+MKL version of EluGrad operator. Uses MKL DNN APIs to compute Elu
+gradients for Elu operation.
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklSoftmax")
+ .Input("logits: T")
+ .Input("mkl_logits: uint8")
+ .Output("softmax: T")
+ .Output("mkl_softmax: uint8")
+ .Attr("T: {half, float, double}")
+ .SetShapeFn([](InferenceContext* c) {
+ return shape_inference::UnchangedShapeWithRankAtLeast(c, 1);
+ })
+ .Doc(R"doc(
+MKL version of Softmax operator. Uses MKL DNN APIs to perform the softmax
+operation.
+)doc");
+
+REGISTER_OP("_MklTanh")
+ .Input("features: T")
+ .Input("mkl_features: uint8")
+ .Output("activations: T")
+ .Output("mkl_activations: uint8")
+ .Attr("T: realnumbertype")
+ .SetShapeFn(shape_inference::UnchangedShape)
+ .Doc(R"doc(
+MKL version of Tanh operator. Uses MKL DNN APIs to implement Tanh operator.
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklTanhGrad")
+ .Input("gradients: T")
+ .Input("features: T")
+ .Input("mkl_gradients: uint8")
+ .Input("mkl_features: uint8")
+ .Output("backprops: T")
+ .Output("mkl_backprops: uint8")
+ .Attr("T: realnumbertype")
+ .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
+ .Doc(R"doc(
+MKL version of TanhGrad operator. Uses MKL DNN APIs to compute tanh
+gradients for Tanh operation.
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
REGISTER_OP("_MklMaxPool")
.Attr("T: {float, half} = DT_FLOAT")
.Attr("ksize: list(int) >= 4")
diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD
index 624145d..aaeccc8 100644
--- a/tensorflow/core/platform/cloud/BUILD
+++ b/tensorflow/core/platform/cloud/BUILD
@@ -10,6 +10,7 @@
load(
"//tensorflow:tensorflow.bzl",
"tf_cc_test",
+ "tf_copts",
)
filegroup(
@@ -29,6 +30,7 @@
cc_library(
name = "expiring_lru_cache",
hdrs = ["expiring_lru_cache.h"],
+ copts = tf_copts(),
visibility = ["//tensorflow:__subpackages__"],
deps = ["//tensorflow/core:lib"],
)
@@ -37,6 +39,7 @@
name = "file_block_cache",
srcs = ["file_block_cache.cc"],
hdrs = ["file_block_cache.h"],
+ copts = tf_copts(),
visibility = ["//tensorflow:__subpackages__"],
deps = ["//tensorflow/core:lib"],
)
@@ -45,6 +48,7 @@
name = "gcs_dns_cache",
srcs = ["gcs_dns_cache.cc"],
hdrs = ["gcs_dns_cache.h"],
+ copts = tf_copts(),
visibility = ["//tensorflow:__subpackages__"],
deps = [
":http_request",
@@ -56,6 +60,7 @@
name = "gcs_file_system",
srcs = ["gcs_file_system.cc"],
hdrs = ["gcs_file_system.h"],
+ copts = tf_copts(),
linkstatic = 1, # Needed since alwayslink is broken in bazel b/27630669
visibility = ["//visibility:public"],
deps = [
@@ -78,6 +83,7 @@
cc_library(
name = "http_request",
hdrs = ["http_request.h"],
+ copts = tf_copts(),
visibility = ["//tensorflow:__subpackages__"],
deps = [
"//tensorflow/core:framework_headers_lib",
@@ -89,6 +95,7 @@
name = "curl_http_request",
srcs = ["curl_http_request.cc"],
hdrs = ["curl_http_request.h"],
+ copts = tf_copts(),
visibility = ["//tensorflow:__subpackages__"],
deps = [
":http_request",
@@ -104,6 +111,7 @@
hdrs = [
"http_request_fake.h",
],
+ copts = tf_copts(),
visibility = ["//tensorflow:__subpackages__"],
deps = [
":curl_http_request",
@@ -121,6 +129,7 @@
"auth_provider.h",
"google_auth_provider.h",
],
+ copts = tf_copts(),
visibility = ["//tensorflow:__subpackages__"],
deps = [
":curl_http_request",
@@ -136,6 +145,7 @@
name = "now_seconds_env",
testonly = 1,
hdrs = ["now_seconds_env.h"],
+ copts = tf_copts(),
visibility = ["//tensorflow:__subpackages__"],
deps = [
"//tensorflow/core:lib",
@@ -151,6 +161,7 @@
hdrs = [
"oauth_client.h",
],
+ copts = tf_copts(),
deps = [
":curl_http_request",
":http_request",
@@ -169,6 +180,7 @@
hdrs = [
"retrying_utils.h",
],
+ copts = tf_copts(),
deps = [
"//tensorflow/core:framework_headers_lib",
"//tensorflow/core:lib_internal",
@@ -183,6 +195,7 @@
hdrs = [
"retrying_file_system.h",
],
+ copts = tf_copts(),
deps = [
":retrying_utils",
"//tensorflow/core:framework_headers_lib",
@@ -198,6 +211,7 @@
hdrs = [
"time_util.h",
],
+ copts = tf_copts(),
deps = [
"//tensorflow/core:framework_headers_lib",
"//tensorflow/core:lib_internal",
diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache.cc b/tensorflow/core/platform/cloud/gcs_dns_cache.cc
index 78bf680..87b0dde 100644
--- a/tensorflow/core/platform/cloud/gcs_dns_cache.cc
+++ b/tensorflow/core/platform/cloud/gcs_dns_cache.cc
@@ -14,9 +14,14 @@
==============================================================================*/
#include "tensorflow/core/platform/cloud/gcs_dns_cache.h"
-
+#ifndef _WIN32
#include <arpa/inet.h>
#include <netdb.h>
+#else
+#include <winsock2.h>
+#include <ws2tcpip.h>
+#include <Windows.h>
+#endif
#include <sys/types.h>
namespace tensorflow {
@@ -26,6 +31,21 @@
const std::vector<string>& kCachedDomainNames =
*new std::vector<string>{"www.googleapis.com", "storage.googleapis.com"};
+inline void print_getaddrinfo_error(const string& name, int error_code) {
+#ifndef _WIN32
+ if (error_code == EAI_SYSTEM) {
+ LOG(ERROR) << "Error resolving " << name
+ << " (EAI_SYSTEM): " << strerror(errno);
+ } else {
+ LOG(ERROR) << "Error resolving " << name << ": "
+ << gai_strerror(error_code);
+ }
+#else
+  // TODO: WSAGetLastError is better than gai_strerror here.
+ LOG(ERROR) << "Error resolving " << name << ": " << gai_strerror(error_code);
+#endif
+}
+
// Selects one item at random from a vector of items, using a uniform
// distribution.
template <typename T>
@@ -86,7 +106,7 @@
std::vector<string> output;
if (return_code == 0) {
- for (addrinfo* i = result; i != nullptr; i = i->ai_next) {
+ for (const addrinfo* i = result; i != nullptr; i = i->ai_next) {
if (i->ai_family != AF_INET || i->ai_addr->sa_family != AF_INET) {
LOG(WARNING) << "Non-IPv4 address returned. ai_family: " << i->ai_family
<< ". sa_family: " << i->ai_addr->sa_family << ".";
@@ -106,13 +126,7 @@
}
}
} else {
- if (return_code == EAI_SYSTEM) {
- LOG(ERROR) << "Error resolving " << name
- << " (EAI_SYSTEM): " << strerror(errno);
- } else {
- LOG(ERROR) << "Error resolving " << name << ": "
- << gai_strerror(return_code);
- }
+ print_getaddrinfo_error(name, return_code);
}
if (result != nullptr) {
freeaddrinfo(result);
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index f80cbf7..a183fe6 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -22,6 +22,9 @@
#include <cstring>
#include <fstream>
#include <vector>
+#ifdef _WIN32
+#include <io.h>  // for _mktemp
+#endif
#include "include/json/json.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/map_util.h"
@@ -40,6 +43,12 @@
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/platform/thread_annotations.h"
+#ifdef _WIN32
+#ifdef DeleteFile
+#undef DeleteFile
+#endif
+#endif
+
namespace tensorflow {
namespace {
@@ -109,16 +118,25 @@
// upload requests.
constexpr char kWriteRequestTimeout[] = "GCS_WRITE_REQUEST_TIMEOUT_SECS";
+// TODO: DO NOT use a hardcoded path
Status GetTmpFilename(string* filename) {
if (!filename) {
return errors::Internal("'filename' cannot be nullptr.");
}
+#ifndef _WIN32
char buffer[] = "/tmp/gcs_filesystem_XXXXXX";
int fd = mkstemp(buffer);
if (fd < 0) {
return errors::Internal("Failed to create a temporary file.");
}
close(fd);
+#else
+ char buffer[] = "/tmp/gcs_filesystem_XXXXXX";
+ char* ret = _mktemp(buffer);
+ if (ret == nullptr) {
+ return errors::Internal("Failed to create a temporary file.");
+ }
+#endif
*filename = buffer;
return Status::OK();
}
@@ -306,6 +324,7 @@
file_cache_erase_(std::move(file_cache_erase)),
sync_needed_(true),
initial_retry_delay_usec_(initial_retry_delay_usec) {
+ // TODO: to make it safer, outfile_ should be constructed from an FD
if (GetTmpFilename(&tmp_content_filename_).ok()) {
outfile_.open(tmp_content_filename_,
std::ofstream::binary | std::ofstream::app);
@@ -429,7 +448,7 @@
return errors::Internal("'size' cannot be nullptr");
}
const auto tellp = outfile_.tellp();
- if (tellp == -1) {
+ if (tellp == static_cast<std::streampos>(-1)) {
return errors::Internal(
"Could not get the size of the internal temporary file.");
}
diff --git a/tensorflow/core/platform/cloud/google_auth_provider.cc b/tensorflow/core/platform/cloud/google_auth_provider.cc
index f6fd837..d77f439 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider.cc
+++ b/tensorflow/core/platform/cloud/google_auth_provider.cc
@@ -14,9 +14,12 @@
==============================================================================*/
#include "tensorflow/core/platform/cloud/google_auth_provider.h"
+#ifndef _WIN32
#include <pwd.h>
-#include <sys/types.h>
#include <unistd.h>
+#else
+#include <sys/types.h>
+#endif
#include <fstream>
#include "include/json/json.h"
#include "tensorflow/core/lib/core/errors.h"
diff --git a/tensorflow/core/platform/cloud/oauth_client.cc b/tensorflow/core/platform/cloud/oauth_client.cc
index c700b97..3c2830c 100644
--- a/tensorflow/core/platform/cloud/oauth_client.cc
+++ b/tensorflow/core/platform/cloud/oauth_client.cc
@@ -14,9 +14,13 @@
==============================================================================*/
#include "tensorflow/core/platform/cloud/oauth_client.h"
+#ifndef _WIN32
#include <pwd.h>
#include <sys/types.h>
#include <unistd.h>
+#else
+#include <sys/types.h>
+#endif
#include <fstream>
#include <openssl/bio.h>
#include <openssl/evp.h>
diff --git a/tensorflow/core/platform/cloud/time_util.cc b/tensorflow/core/platform/cloud/time_util.cc
index 2f8643f..0587a65 100644
--- a/tensorflow/core/platform/cloud/time_util.cc
+++ b/tensorflow/core/platform/cloud/time_util.cc
@@ -18,6 +18,9 @@
#include <cmath>
#include <cstdio>
#include <ctime>
+#ifdef _WIN32
+#define timegm _mkgmtime
+#endif
#include "tensorflow/core/lib/core/errors.h"
namespace tensorflow {
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 0f8cf8f..948334d 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -458,7 +458,6 @@
def tf_additional_core_deps():
return select({
- "//tensorflow:with_gcp_support_windows_override": [],
"//tensorflow:with_gcp_support_android_override": [],
"//tensorflow:with_gcp_support_ios_override": [],
"//tensorflow:with_gcp_support": [
diff --git a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc
index fb1955e..12dc9c5 100644
--- a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc
+++ b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc
@@ -118,9 +118,10 @@
const int retval = fscanf(fp, "%lld", &freq_in_khz);
if (retval < 0) {
LOG(WARNING) << "Failed to \"" << file_path << "\"";
+ fclose(fp);
return INVALID_CPU_FREQUENCY;
}
- pclose(fp);
+ fclose(fp);
return freq_in_khz * 1000; // The file contains cpu frequency in khz
}
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index 234f3c3..682ad97 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -12,9 +12,9 @@
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include "tensorflow/core/platform/s3/s3_file_system.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/s3/s3_file_system.h"
#include "tensorflow/core/platform/s3/s3_crypto.h"
#include <aws/core/Aws.h>
@@ -49,9 +49,15 @@
if (endpoint) {
cfg.endpointOverride = Aws::String(endpoint);
}
- const char* region = getenv("S3_REGION");
+ const char* region = getenv("AWS_REGION");
if (region) {
cfg.region = Aws::String(region);
+ } else {
+ // TODO (yongtang): `S3_REGION` should be deprecated after 2.0.
+ const char* region = getenv("S3_REGION");
+ if (region) {
+ cfg.region = Aws::String(region);
+ }
}
const char* use_https = getenv("S3_USE_HTTPS");
if (use_https) {
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 148c785..2caf5fc 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -328,6 +328,10 @@
// Forward decl
TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format);
+memory::dims CalculateTFStrides(const memory::dims& dims_tf_order);
+memory::desc CreateBlockedMemDescHelper(const memory::dims& dim,
+ const memory::dims& strides,
+ memory::data_type dtype);
class MklDnnShape {
private:
@@ -364,6 +368,52 @@
~MklDnnShape() {}
TF_DISALLOW_COPY_AND_ASSIGN(MklDnnShape); // Cannot copy
+ /// Helper function to compare memory::desc objects for MklDnn.
+  /// Maybe this should go into MklDnn directly.
+ inline bool CompareMklDnnLayouts(const memory::desc& md1,
+ const memory::desc& md2) const {
+ mkldnn_memory_desc_t mdd1 = md1.data;
+ mkldnn_memory_desc_t mdd2 = md2.data;
+ const char* d1 = reinterpret_cast<const char*>(&mdd1);
+ const char* d2 = reinterpret_cast<const char*>(&mdd2);
+
+ size_t md_size = sizeof(mdd1);
+ for (size_t i = 0; i < md_size; i++) {
+ if (*d1++ != *d2++) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /// Equality function for MklDnnShape objects
+ /// @return true if both are equal; false otherwise.
+ inline bool operator == (const MklDnnShape& input_shape) const {
+ if (this->IsMklTensor() != input_shape.IsMklTensor()) {
+ return false;
+ }
+
+ // If input tensors are in Mkl layout, then we check for dimensions and
+ // sizes.
+ if (this->IsMklTensor()) {
+ return this->GetTfShape() == input_shape.GetTfShape() &&
+ CompareMklDnnLayouts(this->GetMklLayout(),
+ input_shape.GetMklLayout());
+ }
+
+ return true;
+ }
+
+ /// Equality operator for MklDnnShape and TFShape.
+ /// Returns: true if TF shapes for both are the same, false otherwise
+ inline bool operator == (const TensorShape& input_shape) const {
+ if (!this->IsMklTensor()) {
+ return false;
+ }
+
+ return this->GetTfShape() == input_shape;
+ }
+
inline const bool IsMklTensor() const { return data_.is_mkl_tensor_; }
inline void SetMklTensor(bool is_mkl_tensor) {
data_.is_mkl_tensor_ = is_mkl_tensor;
@@ -375,7 +425,7 @@
inline size_t GetDimension(char dimension) const {
int index = GetMklDnnTensorDimIndex(dimension);
CHECK(index >= 0 && index < this->GetDimension())
- << "Invalid index from the dimension: " << index << ", " << dimension;
+ << "Invalid index from the dimension: " << index << ", " << dimension;
return this->DimSize(index);
}
@@ -405,7 +455,7 @@
inline memory::dims GetSizesAsMklDnnDims() const {
memory::dims retVal;
if (data_.is_mkl_tensor_) {
- int dimensions = sizeof(data_.sizes_) / sizeof(data_.sizes_[0]);
+ size_t dimensions = sizeof(data_.sizes_) / sizeof(data_.sizes_[0]);
for (size_t i = 0; i < dimensions; i++) {
if (data_.sizes_[i] != INVALID_DIM_SIZE)
retVal.push_back(data_.sizes_[i]);
@@ -423,12 +473,21 @@
/// Return TensorShape that describes the Tensorflow shape of the tensor
/// represented by this MklShape.
- inline TensorShape GetTfShape() {
+ inline TensorShape GetTfShape() const {
CHECK_EQ(data_.is_mkl_tensor_, true);
std::vector<int32> shape(data_.dimension_, -1);
- for (size_t idx = 0; idx < data_.dimension_; ++idx) {
- shape[idx] = data_.sizes_[TfDimIdx(idx)];
+ if (data_.tf_data_format_ != memory::format::blocked) {
+ for (size_t idx = 0; idx < data_.dimension_; ++idx) {
+ shape[idx] = data_.sizes_[TfDimIdx(idx)];
+ }
+ } else {
+      // If the Tensorflow shape is in Blocked format, we don't have a
+      // dimension map for it, so we just create the Tensorflow shape from
+      // the sizes in the specified order.
+ for (size_t idx = 0; idx < data_.dimension_; ++idx) {
+ shape[idx] = data_.sizes_[idx];
+ }
}
TensorShape ts;
@@ -444,6 +503,12 @@
CHECK_NOTNULL(pd);
data_.mkl_md_ = pd->desc().data;
}
+
+ inline void SetMklLayout(memory::desc* md) {
+ CHECK_NOTNULL(md);
+ data_.mkl_md_ = md->data;
+ }
+
inline const memory::desc GetMklLayout() const {
return memory::desc(data_.mkl_md_);
}
@@ -452,7 +517,8 @@
return data_.tf_data_format_;
}
/// We don't create primitive_descriptor for TensorFlow layout now.
- /// We use lazy evaluation and create it only when needed.
+ /// We use lazy evaluation and create it only when needed. Input format can
+ /// also be Blocked format.
inline void SetTfLayout(size_t dims, const memory::dims& sizes,
memory::format format) {
CHECK_EQ(dims, sizes.size());
@@ -461,15 +527,26 @@
data_.sizes_[ii] = sizes[ii];
}
data_.tf_data_format_ = format;
- SetTfDimOrder(dims, format);
+ if (format != memory::format::blocked) {
+ SetTfDimOrder(dims, format);
+ }
}
+
inline const memory::desc GetTfLayout() const {
memory::dims dims;
for (size_t ii = 0; ii < data_.dimension_; ii++) {
dims.push_back(data_.sizes_[ii]);
}
- return memory::desc(dims, data_.T_, data_.tf_data_format_);
+
+ // Create Blocked memory desc if input TF format was set like that.
+ if (data_.tf_data_format_ == memory::format::blocked) {
+ auto strides = CalculateTFStrides(dims);
+ return CreateBlockedMemDescHelper(dims, strides, data_.T_);
+ } else {
+ return memory::desc(dims, data_.T_, data_.tf_data_format_);
+ }
}
+
inline const memory::desc GetCurLayout() const {
return IsMklTensor() ? GetMklLayout() : GetTfLayout();
}
@@ -579,8 +656,13 @@
#endif
// List of MklShape objects. Used in Concat/Split layers.
+
typedef std::vector<MklShape> MklShapeList;
+#ifdef INTEL_MKL_DNN
+typedef std::vector<MklDnnShape> MklDnnShapeList;
+#endif
+
// Check if all tensors specified by MklShapes are MKL tensors.
inline bool AreAllMklTensors(const MklShapeList& shapes) {
for (auto& s : shapes) {
@@ -591,6 +673,7 @@
return true;
}
+#ifndef INTEL_MKL_DNN
template <typename T>
inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
const MklShape& mkl_shape) {
@@ -615,32 +698,15 @@
return output_tensor;
}
-
-#ifdef INTEL_MKL_DNN
+#else
template <typename T>
inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
const MklDnnShape& mkl_shape) {
Tensor output_tensor;
TensorShape output_shape;
-#if 0
- // TODO(nhasabni): need to implement
- for (size_t j = 0; j < mkl_shape.GetDimension(); j++) {
- // Outermost to innermost dimension
- output_shape.AddDim(mkl_shape.GetSizes()[mkl_shape.tf_dim_idx(j)]);
- }
-
- // Allocate output tensor.
- context->allocate_temp(DataTypeToEnum<T>::v(), output_shape, &output_tensor);
-
- dnnLayout_t output_layout = static_cast<dnnLayout_t>(mkl_shape.GetTfLayout());
- void* input_buffer = const_cast<T*>(mkl_tensor.flat<T>().data());
- void* output_buffer = const_cast<T*>(output_tensor.flat<T>().data());
-
- if (mkl_tensor.NumElements() != 0) {
- mkl_shape.GetConvertedFlatData(output_layout, input_buffer, output_buffer);
- }
-#endif
+ TF_CHECK_OK(Status(error::Code::UNIMPLEMENTED,
+ "Unimplemented conversion function"));
return output_tensor;
}
@@ -682,6 +748,9 @@
ctext->input_list(name, input_tensors);
}
+
+#ifndef INTEL_MKL_DNN
+
inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
MklShapeList* mkl_shapes) {
OpInputList input_mkl_tensors;
@@ -694,6 +763,22 @@
}
}
+#else
+
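+// MKL-DNN variant: deserializes each meta tensor in the input list into the
+// corresponding MklDnnShape object.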
+inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
+ MklDnnShapeList* mkl_shapes) {
+ OpInputList input_mkl_tensors;
+ GetMklInputList(ctext, strings::StrCat("mkl_", name), &input_mkl_tensors);
+
+ for (int i = 0; i < input_mkl_tensors.size(); i++) {
+ (*mkl_shapes)[i].DeSerializeMklDnnShape(
+ input_mkl_tensors[i].flat<uint8>().data(),
+ input_mkl_tensors[i].flat<uint8>().size() * sizeof(uint8));
+ }
+}
+
+#endif
+
#ifdef INTEL_MKL_DNN
/// Get shape of input tensor pointed by 'input_idx' in TensorShape format.
/// If the input tensor is in MKL layout, then obtains TensorShape from
@@ -909,6 +994,7 @@
context->set_output(idx_meta_out, meta_output);
}
+#ifndef INTEL_MKL_DNN
inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
int idx_in, int idx_out,
const TensorShape& shape) {
@@ -926,6 +1012,27 @@
CHECK(output.CopyFrom(data, shape));
context->set_output(idx_data_out, output);
}
+#else
+inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
+ int idx_in, int idx_out,
+ const TensorShape& shape) {
+ int num_inputs = context->num_inputs();
+ int num_outputs = context->num_outputs();
+ int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+ int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+
+ const Tensor& data = context->input(idx_data_in);
+ MklDnnShape mkl_shape_output;
+ mkl_shape_output.SetMklTensor(false);
+ AllocateOutputSetMklShape(context, idx_out, mkl_shape_output);
+ Tensor output(data.dtype());
+ // TODO(intel_tf): alternatively, call forward_input_to_output_with_shape(...)
+ CHECK(output.CopyFrom(data, shape));
+ context->set_output(idx_data_out, output);
+}
+#endif
+
+#ifndef INTEL_MKL_DNN
inline void ForwardTfTensorInToOut(OpKernelContext* context,
int idx_in, int idx_out) {
@@ -944,6 +1051,27 @@
}
}
+#else
+
+inline void ForwardTfTensorInToOut(OpKernelContext* context,
+ int idx_in, int idx_out) {
+ int num_inputs = context->num_inputs();
+ int num_outputs = context->num_outputs();
+ int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+ int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+
+ MklDnnShape dnn_shape_output;
+ dnn_shape_output.SetMklTensor(false);
+ AllocateOutputSetMklShape(context, idx_out, dnn_shape_output);
+ if (IsRefType(context->input_dtype(idx_data_in))) {
+ context->forward_ref_input_to_ref_output(idx_data_in, idx_data_out);
+ } else {
+ context->set_output(idx_data_out, context->input(idx_data_in));
+ }
+}
+
+#endif
+
inline void ForwardMklTensorInToOut(OpKernelContext* context,
int idx_in, int idx_out) {
int num_inputs = context->num_inputs();
@@ -962,6 +1090,25 @@
}
}
+#ifdef INTEL_MKL_DNN
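+// Forwards the data tensor at input slot idx_in to output slot idx_out and
+// attaches the given MklDnnShape as the output's meta tensor.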
+inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context,
+ int idx_in, int idx_out,
+ const MklDnnShape& mkl_shape) {
+ int num_inputs = context->num_inputs();
+ int num_outputs = context->num_outputs();
+ int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+ int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+
+ AllocateOutputSetMklShape(context, idx_out, mkl_shape);
+
+ if (IsRefType(context->input_dtype(idx_data_in))) {
+ context->forward_ref_input_to_ref_output(idx_data_in, idx_data_out);
+ } else {
+ context->set_output(idx_data_out, context->input(idx_data_in));
+ }
+}
+#endif
+
// Forward the MKL shape ONLY (used in elementwise and other ops where
// we call the eigen implementation and MKL shape is not used)
inline void ForwardMklMetaDataInToOut(OpKernelContext* context,
@@ -985,6 +1132,10 @@
AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output);
}
+#ifndef INTEL_MKL_DNN
+// We don't need these functions in MKLDNN. We have defined the equality
+// operator on the MklDnnShape class directly.
+
// Checks if the TF shape for both MKL tensors is the same or not
// Returns: true if both TF shapes are the same, false otherwise
inline bool MklCompareShapes(const MklShape* input_shape_0,
@@ -1051,6 +1202,7 @@
return true;
}
+#endif
// These functions do not compile with MKL-DNN since mkl.h is missing.
// We may need to remove them later.
@@ -1127,11 +1279,14 @@
/// @return: Tensorflow data format corresponding to memory::format
/// Fails with an error if invalid data format.
inline TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format) {
- if (format == memory::format::nhwc)
- return FORMAT_NHWC;
- else if (format == memory::format::nchw)
- return FORMAT_NCHW;
- TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format"));
+ if (format == memory::format::nhwc) return FORMAT_NHWC;
+ else if (format == memory::format::nchw) return FORMAT_NCHW;
+ TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT,
+ "Unsupported data format"));
+
+  // Return something to avoid compiler warnings; the TF_CHECK_OK above
+  // ensures we never actually reach this point.
+ return FORMAT_NHWC;
}
/// Map TensorShape object into memory::dims required by MKL-DNN
@@ -1175,6 +1330,23 @@
return memory::dims({n, c, h, w});
}
+/// Overloaded version of function above. Input parameters are
+/// self-explanatory.
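+/// For example, for format NHWC and in_dims {N, H, W, C} this returns
+/// {N, C, H, W}.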
+inline memory::dims MklDnnDimsInNCHW(const memory::dims& in_dims,
+ TensorFormat format) {
+ // Check validity of format.
+ CHECK_NE(TFDataFormatToMklDnnDataFormat(format),
+ memory::format::format_undef);
+
+ int n = in_dims[GetTensorDimIndex(format, 'N')];
+ int c = in_dims[GetTensorDimIndex(format, 'C')];
+ int h = in_dims[GetTensorDimIndex(format, 'H')];
+ int w = in_dims[GetTensorDimIndex(format, 'W')];
+
+ // MKL-DNN requires dimensions in NCHW format.
+ return memory::dims({n, c, h, w});
+}
+
/// Map MklDnn memory::dims object into TensorShape object.
///
/// This function will simply map input shape in MKL-DNN memory::dims format
@@ -1217,6 +1389,43 @@
return padding_kind::zero;
}
+/// Helper function to create memory descriptor in Blocked format
+///
+/// @input: Tensor dimensions
+/// @input: strides corresponding to dimensions. One can use utility
+/// function such as CalculateTFStrides to compute strides
+/// for given dimensions.
+/// @return: memory::desc object corresponding to blocked memory format
+/// for given dimensions and strides.
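+/// Example (illustrative): dims {2, 3} with row-major strides {3, 1} describe
+/// a plain contiguous 2x3 buffer in blocked format.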
+inline memory::desc CreateBlockedMemDescHelper(const memory::dims& dim,
+ const memory::dims& strides,
+ memory::data_type dtype) {
+ CHECK_EQ(dim.size(), strides.size());
+
+  // We have to construct the memory descriptor in a C style. This is not at
+  // all ideal, but MKLDNN does not offer any API to construct a descriptor in
+  // blocked format other than a copy constructor that accepts
+  // mkldnn_memory_desc_t.
+ mkldnn_memory_desc_t md;
+ md.primitive_kind = mkldnn_memory;
+ md.ndims = dim.size();
+ md.format = mkldnn_blocked;
+ md.data_type = memory::convert_to_c(dtype);
+
+ for (size_t i = 0; i < dim.size(); i++) {
+ md.layout_desc.blocking.block_dims[i] = 1;
+ md.layout_desc.blocking.strides[1][i] = 1;
+ md.layout_desc.blocking.strides[0][i] = strides[i];
+ md.layout_desc.blocking.padding_dims[i] = dim[i];
+ md.layout_desc.blocking.offset_padding_to_data[i] = 0;
+ md.dims[i] = dim[i];
+ }
+ md.layout_desc.blocking.offset_padding = 0;
+
+ return memory::desc(md);
+}
+
+
/*
* Class to represent all the resources corresponding to a tensor in TensorFlow
* that are required to execute an operation (such as Convolution).
@@ -1285,30 +1494,8 @@
/// @return: memory::desc object corresponding to blocked memory format
/// for given dimensions and strides.
static inline memory::desc CreateBlockedMemDesc(const memory::dims& dim,
- const memory::dims& strides) {
- CHECK_EQ(dim.size(), strides.size());
-
- // We have to construct memory descriptor in a C style. This is not at all
- // ideal but MKLDNN does not offer any API to construct descriptor in
- // blocked format except a copy constructor that accepts
- // mkldnn_memory_desc_t.
- mkldnn_memory_desc_t md;
- md.primitive_kind = mkldnn_memory;
- md.ndims = dim.size();
- md.format = mkldnn_blocked;
- md.data_type = memory::convert_to_c(MklDnnType<T>());
-
- for (size_t i = 0; i < dim.size(); i++) {
- md.layout_desc.blocking.block_dims[i] = 1;
- md.layout_desc.blocking.strides[1][i] = 1;
- md.layout_desc.blocking.strides[0][i] = strides[i];
- md.layout_desc.blocking.padding_dims[i] = dim[i];
- md.layout_desc.blocking.offset_padding_to_data[i] = 0;
- md.dims[i] = dim[i];
- }
- md.layout_desc.blocking.offset_padding = 0;
-
- return memory::desc(md);
+ const memory::dims& strides) {
+ return CreateBlockedMemDescHelper(dim, strides, MklDnnType<T>());
}
/// A version of SetUsrMem call that allows user to create memory in blocked
@@ -1376,6 +1563,7 @@
return user_memory_->get_primitive_desc();
}
+
/// Get function for descriptor of user memory.
inline memory::desc GetUsrMemDesc() {
// This is ugly. Why MKL-DNN does not provide desc() method of const type??
@@ -1438,6 +1626,17 @@
return op_pd != user_memory_->get_primitive_desc();
}
+  /// Predicate that checks whether the user's memory needs to be reordered
+  /// to match the provided target format.
+ ///
+ /// @input: target_format - memory format of the given input of an
+ /// operation
+ /// @return: true in case reorder of input is needed; false, otherwise.
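+  /// For example (illustrative): user memory kept in plain nhwc while the op
+  /// expects a blocked format such as nChw8c would require a reorder.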
+ inline bool IsReorderNeeded(const memory::format& target_format) const {
+ CHECK_NOTNULL(user_memory_);
+ return target_format != user_memory_->get_primitive_desc().desc().data.format;
+ }
+
/// Function to create a reorder from memory pointed by from to memory pointed
/// by to. Returns created primitive.
inline primitive CreateReorder(const memory* from, const memory* to) const {
diff --git a/tensorflow/docs_src/api_guides/python/image.md b/tensorflow/docs_src/api_guides/python/image.md
index a2c8c3c..051e454 100644
--- a/tensorflow/docs_src/api_guides/python/image.md
+++ b/tensorflow/docs_src/api_guides/python/image.md
@@ -19,6 +19,7 @@
presently only support RGB, HSV, and GrayScale. Presently, the alpha channel has
to be stripped from the image and re-attached using slicing ops.
+* @{tf.image.decode_bmp}
* @{tf.image.decode_gif}
* @{tf.image.decode_jpeg}
* @{tf.image.encode_jpeg}
diff --git a/tensorflow/docs_src/api_guides/python/reading_data.md b/tensorflow/docs_src/api_guides/python/reading_data.md
index 4594887..f316cce 100644
--- a/tensorflow/docs_src/api_guides/python/reading_data.md
+++ b/tensorflow/docs_src/api_guides/python/reading_data.md
@@ -175,14 +175,25 @@
[`tensorflow/examples/how_tos/reading_data/convert_to_records.py`](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/convert_to_records.py)
converts MNIST data to this format.
-To read a file of TFRecords, use
-@{tf.TFRecordReader} with
-the @{tf.parse_single_example}
-decoder. The `parse_single_example` op decodes the example protocol buffers into
-tensors. An MNIST example using the data produced by `convert_to_records` can be
-found in
-[`tensorflow/examples/how_tos/reading_data/fully_connected_reader.py`](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py),
-which you can compare with the `fully_connected_feed` version.
+The recommended way to read a TFRecord file is with a @{tf.data.TFRecordDataset}, [as in this example](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py):
+
+``` python
+ dataset = tf.data.TFRecordDataset(filename)
+ dataset = dataset.repeat(num_epochs)
+
+ # map takes a python function and applies it to every sample
+ dataset = dataset.map(decode)
+```
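+
+The elements of this `dataset` can then be batched and consumed through an
+iterator. A minimal sketch (the shuffle buffer, `batch_size`, and one-shot
+iterator below are illustrative choices, not part of the example above):
+
+``` python
+  dataset = dataset.shuffle(1000).batch(batch_size)
+  iterator = dataset.make_one_shot_iterator()
+  image, label = iterator.get_next()
+```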
+
+Accomplishing the same task with a queue-based input pipeline requires the
+following code (using the same `decode` function from the example above):
+
+``` python
+ filename_queue = tf.train.string_input_producer([filename], num_epochs=num_epochs)
+ reader = tf.TFRecordReader()
+ _, serialized_example = reader.read(filename_queue)
+  image, label = decode(serialized_example)
+```
### Preprocessing
diff --git a/tensorflow/docs_src/get_started/mnist/mechanics.md b/tensorflow/docs_src/get_started/mnist/mechanics.md
index 71eee42..dac0049 100644
--- a/tensorflow/docs_src/get_started/mnist/mechanics.md
+++ b/tensorflow/docs_src/get_started/mnist/mechanics.md
@@ -47,7 +47,7 @@
instances.
```python
-data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)
+data_sets = input_data.read_data_sets(FLAGS.input_data_dir, FLAGS.fake_data)
```
**NOTE**: The `fake_data` flag is used for unit-testing purposes and may be
@@ -364,7 +364,7 @@
contain both the graph itself and the values of the summaries.
```python
-summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
+summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)
```
Lastly, the events file will be updated with new summary values every time the
@@ -398,7 +398,7 @@
directory with the current values of all the trainable variables.
```python
-saver.save(sess, FLAGS.train_dir, global_step=step)
+saver.save(sess, checkpoint_file, global_step=step)
```
At some later point in the future, training might be resumed by using the
@@ -406,7 +406,7 @@
method to reload the model parameters.
```python
-saver.restore(sess, FLAGS.train_dir)
+saver.restore(sess, checkpoint_file)
```
## Evaluate the Model
diff --git a/tensorflow/docs_src/programmers_guide/estimators.md b/tensorflow/docs_src/programmers_guide/estimators.md
index 6544a16..8b6cbbc 100644
--- a/tensorflow/docs_src/programmers_guide/estimators.md
+++ b/tensorflow/docs_src/programmers_guide/estimators.md
@@ -187,7 +187,7 @@
Note that the names of feature columns and labels of a keras estimator come from
the corresponding compiled keras model. For example, the input key names for
@{$get_started/input_fn} in above `est_inception_v3` estimator can be obtained
-from `keras_inception_v3.input_names`, and similarily, the predicted output
+from `keras_inception_v3.input_names`, and similarly, the predicted output
names can be obtained from `keras_inception_v3.output_names`.
For more details, please refer to the documentation for
diff --git a/tensorflow/docs_src/programmers_guide/variables.md b/tensorflow/docs_src/programmers_guide/variables.md
index 16753c9..bac385c 100644
--- a/tensorflow/docs_src/programmers_guide/variables.md
+++ b/tensorflow/docs_src/programmers_guide/variables.md
@@ -205,7 +205,7 @@
v = tf.get_variable("v", shape=(), initializer=tf.zeros_initializer())
assignment = v.assign_add(1)
tf.global_variables_initializer().run()
-assignment.run()
+sess.run(assignment) # or assignment.op.run()
```
Most TensorFlow optimizers have specialized ops that efficiently update the
diff --git a/tensorflow/examples/android/build.gradle b/tensorflow/examples/android/build.gradle
index 48f566f..f7bdf8b 100644
--- a/tensorflow/examples/android/build.gradle
+++ b/tensorflow/examples/android/build.gradle
@@ -28,8 +28,8 @@
}
dependencies {
- classpath 'com.android.tools.build:gradle:2.3.0'
- classpath 'org.apache.httpcomponents:httpclient:4.5.2'
+ classpath 'com.android.tools.build:gradle:3.0.1'
+ classpath 'org.apache.httpcomponents:httpclient:4.5.4'
}
}
@@ -75,7 +75,7 @@
android {
compileSdkVersion 23
- buildToolsVersion "25.0.2"
+ buildToolsVersion '26.0.2'
if (nativeBuildSystem == 'cmake') {
defaultConfig {
diff --git a/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.jar b/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.jar
new file mode 100644
index 0000000..13372ae
--- /dev/null
+++ b/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.jar
Binary files differ
diff --git a/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.properties b/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.properties
new file mode 100644
index 0000000..bd9ee87
--- /dev/null
+++ b/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.properties
@@ -0,0 +1,6 @@
+#Sat Nov 18 15:06:47 CET 2017
+distributionBase=GRADLE_USER_HOME
+distributionPath=wrapper/dists
+zipStoreBase=GRADLE_USER_HOME
+zipStorePath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.1-all.zip
diff --git a/tensorflow/examples/android/gradlew b/tensorflow/examples/android/gradlew
new file mode 100644
index 0000000..9d82f78
--- /dev/null
+++ b/tensorflow/examples/android/gradlew
@@ -0,0 +1,160 @@
+#!/usr/bin/env bash
+
+##############################################################################
+##
+## Gradle start up script for UN*X
+##
+##############################################################################
+
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS=""
+
+APP_NAME="Gradle"
+APP_BASE_NAME=`basename "$0"`
+
+# Use the maximum available, or set MAX_FD != -1 to use that value.
+MAX_FD="maximum"
+
+warn ( ) {
+ echo "$*"
+}
+
+die ( ) {
+ echo
+ echo "$*"
+ echo
+ exit 1
+}
+
+# OS specific support (must be 'true' or 'false').
+cygwin=false
+msys=false
+darwin=false
+case "`uname`" in
+ CYGWIN* )
+ cygwin=true
+ ;;
+ Darwin* )
+ darwin=true
+ ;;
+ MINGW* )
+ msys=true
+ ;;
+esac
+
+# Attempt to set APP_HOME
+# Resolve links: $0 may be a link
+PRG="$0"
+# Need this for relative symlinks.
+while [ -h "$PRG" ] ; do
+ ls=`ls -ld "$PRG"`
+ link=`expr "$ls" : '.*-> \(.*\)$'`
+ if expr "$link" : '/.*' > /dev/null; then
+ PRG="$link"
+ else
+ PRG=`dirname "$PRG"`"/$link"
+ fi
+done
+SAVED="`pwd`"
+cd "`dirname \"$PRG\"`/" >/dev/null
+APP_HOME="`pwd -P`"
+cd "$SAVED" >/dev/null
+
+CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
+
+# Determine the Java command to use to start the JVM.
+if [ -n "$JAVA_HOME" ] ; then
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
+ # IBM's JDK on AIX uses strange locations for the executables
+ JAVACMD="$JAVA_HOME/jre/sh/java"
+ else
+ JAVACMD="$JAVA_HOME/bin/java"
+ fi
+ if [ ! -x "$JAVACMD" ] ; then
+ die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+ fi
+else
+ JAVACMD="java"
+ which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+fi
+
+# Increase the maximum file descriptors if we can.
+if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
+ MAX_FD_LIMIT=`ulimit -H -n`
+ if [ $? -eq 0 ] ; then
+ if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
+ MAX_FD="$MAX_FD_LIMIT"
+ fi
+ ulimit -n $MAX_FD
+ if [ $? -ne 0 ] ; then
+ warn "Could not set maximum file descriptor limit: $MAX_FD"
+ fi
+ else
+ warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
+ fi
+fi
+
+# For Darwin, add options to specify how the application appears in the dock
+if $darwin; then
+ GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
+fi
+
+# For Cygwin, switch paths to Windows format before running java
+if $cygwin ; then
+ APP_HOME=`cygpath --path --mixed "$APP_HOME"`
+ CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
+ JAVACMD=`cygpath --unix "$JAVACMD"`
+
+ # We build the pattern for arguments to be converted via cygpath
+ ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
+ SEP=""
+ for dir in $ROOTDIRSRAW ; do
+ ROOTDIRS="$ROOTDIRS$SEP$dir"
+ SEP="|"
+ done
+ OURCYGPATTERN="(^($ROOTDIRS))"
+ # Add a user-defined pattern to the cygpath arguments
+ if [ "$GRADLE_CYGPATTERN" != "" ] ; then
+ OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
+ fi
+ # Now convert the arguments - kludge to limit ourselves to /bin/sh
+ i=0
+ for arg in "$@" ; do
+ CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
+ CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
+
+ if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
+ eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
+ else
+ eval `echo args$i`="\"$arg\""
+ fi
+ i=$((i+1))
+ done
+ case $i in
+ (0) set -- ;;
+ (1) set -- "$args0" ;;
+ (2) set -- "$args0" "$args1" ;;
+ (3) set -- "$args0" "$args1" "$args2" ;;
+ (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
+ (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
+ (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
+ (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
+ (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
+ (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
+ esac
+fi
+
+# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
+function splitJvmOpts() {
+ JVM_OPTS=("$@")
+}
+eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
+JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
+
+exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
diff --git a/tensorflow/examples/android/gradlew.bat b/tensorflow/examples/android/gradlew.bat
new file mode 100644
index 0000000..8a0b282
--- /dev/null
+++ b/tensorflow/examples/android/gradlew.bat
@@ -0,0 +1,90 @@
+@if "%DEBUG%" == "" @echo off
+@rem ##########################################################################
+@rem
+@rem Gradle startup script for Windows
+@rem
+@rem ##########################################################################
+
+@rem Set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" setlocal
+
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS=
+
+set DIRNAME=%~dp0
+if "%DIRNAME%" == "" set DIRNAME=.
+set APP_BASE_NAME=%~n0
+set APP_HOME=%DIRNAME%
+
+@rem Find java.exe
+if defined JAVA_HOME goto findJavaFromJavaHome
+
+set JAVA_EXE=java.exe
+%JAVA_EXE% -version >NUL 2>&1
+if "%ERRORLEVEL%" == "0" goto init
+
+echo.
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:findJavaFromJavaHome
+set JAVA_HOME=%JAVA_HOME:"=%
+set JAVA_EXE=%JAVA_HOME%/bin/java.exe
+
+if exist "%JAVA_EXE%" goto init
+
+echo.
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:init
+@rem Get command-line arguments, handling Windows variants
+
+if not "%OS%" == "Windows_NT" goto win9xME_args
+if "%@eval[2+2]" == "4" goto 4NT_args
+
+:win9xME_args
+@rem Slurp the command line arguments.
+set CMD_LINE_ARGS=
+set _SKIP=2
+
+:win9xME_args_slurp
+if "x%~1" == "x" goto execute
+
+set CMD_LINE_ARGS=%*
+goto execute
+
+:4NT_args
+@rem Get arguments from the 4NT Shell from JP Software
+set CMD_LINE_ARGS=%$
+
+:execute
+@rem Setup the command line
+
+set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
+
+@rem Execute Gradle
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
+
+:end
+@rem End local scope for the variables with windows NT shell
+if "%ERRORLEVEL%"=="0" goto mainEnd
+
+:fail
+rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
+rem the _cmd.exe /c_ return code!
+if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
+exit /b 1
+
+:mainEnd
+if "%OS%"=="Windows_NT" endlocal
+
+:omega
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
index 4e45f42d..8bd4abb 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
@@ -333,8 +333,12 @@
continue;
}
- useCamera2API = isHardwareLevelSupported(characteristics,
- CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_FULL);
+        // Fall back to the camera1 API for internal cameras that don't have full support.
+ // This should help with legacy situations where using the camera2 API causes
+ // distorted or otherwise broken previews.
+ useCamera2API = (facing == CameraCharacteristics.LENS_FACING_EXTERNAL)
+ || isHardwareLevelSupported(characteristics,
+ CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_FULL);
LOGGER.i("Camera API lv2?: %s", useCamera2API);
return cameraId;
}
diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
index a9ed02d..9db8835 100644
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
+++ b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
@@ -45,9 +45,7 @@
VALIDATION_FILE = 'validation.tfrecords'
-def read_and_decode(filename_queue):
- reader = tf.TFRecordReader()
- _, serialized_example = reader.read(filename_queue)
+def decode(serialized_example):
features = tf.parse_single_example(
serialized_example,
# Defaults are not specified since both keys are required.
@@ -60,22 +58,26 @@
# length mnist.IMAGE_PIXELS) to a uint8 tensor with shape
# [mnist.IMAGE_PIXELS].
image = tf.decode_raw(features['image_raw'], tf.uint8)
- image.set_shape([mnist.IMAGE_PIXELS])
+  image.set_shape((mnist.IMAGE_PIXELS,))
+ # Convert label from a scalar uint8 tensor to an int32 scalar.
+ label = tf.cast(features['label'], tf.int32)
+
+ return image, label
+
+def augment(image, label):
# OPTIONAL: Could reshape into a 28x28 image and apply distortions
# here. Since we are not applying any distortions in this
# example, and the next step expects the image to be flattened
# into a vector, we don't bother.
+ return image, label
+def normalize(image, label):
# Convert from [0, 255] -> [-0.5, 0.5] floats.
image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
- # Convert label from a scalar uint8 tensor to an int32 scalar.
- label = tf.cast(features['label'], tf.int32)
-
return image, label
-
def inputs(train, batch_size, num_epochs):
"""Reads input data num_epochs times.
@@ -91,31 +93,32 @@
in the range [-0.5, 0.5].
* labels is an int32 tensor with shape [batch_size] with the true label,
a number in the range [0, mnist.NUM_CLASSES).
- Note that an tf.train.QueueRunner is added to the graph, which
- must be run using e.g. tf.train.start_queue_runners().
+
+  This function creates a one_shot_iterator, meaning that it will only iterate
+  over the dataset once. On the other hand, no special initialization is
+  required.
"""
if not num_epochs: num_epochs = None
filename = os.path.join(FLAGS.train_dir,
TRAIN_FILE if train else VALIDATION_FILE)
with tf.name_scope('input'):
- filename_queue = tf.train.string_input_producer(
- [filename], num_epochs=num_epochs)
+    # TFRecordDataset opens a TFRecords file and reads serialized examples
+    # one at a time; the argument could also be a [list, of, filenames].
+ dataset = tf.data.TFRecordDataset(filename)
+ dataset = dataset.repeat(num_epochs)
- # Even when reading in multiple threads, share the filename
- # queue.
- image, label = read_and_decode(filename_queue)
+ # map takes a python function and applies it to every sample
+ dataset = dataset.map(decode)
+ dataset = dataset.map(augment)
+ dataset = dataset.map(normalize)
- # Shuffle the examples and collect them into batch_size batches.
- # (Internally uses a RandomShuffleQueue.)
- # We run this in two threads to avoid being a bottleneck.
- images, sparse_labels = tf.train.shuffle_batch(
- [image, label], batch_size=batch_size, num_threads=2,
- capacity=1000 + 3 * batch_size,
- # Ensures a minimum amount of shuffling of examples.
- min_after_dequeue=1000)
+    # The parameter is the shuffle buffer size.
+ dataset = dataset.shuffle(1000 + 3 * batch_size)
+ dataset = dataset.batch(batch_size)
- return images, sparse_labels
+ iterator = dataset.make_one_shot_iterator()
+ return iterator.get_next()
def run_training():
@@ -124,16 +127,16 @@
# Tell TensorFlow that the model will be built into the default Graph.
with tf.Graph().as_default():
# Input images and labels.
- images, labels = inputs(train=True, batch_size=FLAGS.batch_size,
- num_epochs=FLAGS.num_epochs)
+ image_batch, label_batch = inputs(train=True, batch_size=FLAGS.batch_size,
+ num_epochs=FLAGS.num_epochs)
# Build a Graph that computes predictions from the inference model.
- logits = mnist.inference(images,
+ logits = mnist.inference(image_batch,
FLAGS.hidden1,
FLAGS.hidden2)
# Add to the Graph the loss calculation.
- loss = mnist.loss(logits, labels)
+ loss = mnist.loss(logits, label_batch)
# Add to the Graph operations that train the model.
train_op = mnist.training(loss, FLAGS.learning_rate)
@@ -143,47 +146,33 @@
tf.local_variables_initializer())
# Create a session for running operations in the Graph.
- sess = tf.Session()
+ with tf.Session() as sess:
+ # Initialize the variables (the trained variables and the
+ # epoch counter).
+ sess.run(init_op)
+ try:
+ step = 0
+        while True:  # Train until OutOfRangeError is raised.
+ start_time = time.time()
- # Initialize the variables (the trained variables and the
- # epoch counter).
- sess.run(init_op)
+ # Run one step of the model. The return values are
+ # the activations from the `train_op` (which is
+ # discarded) and the `loss` op. To inspect the values
+ # of your ops or variables, you may include them in
+ # the list passed to sess.run() and the value tensors
+ # will be returned in the tuple from the call.
+ _, loss_value = sess.run([train_op, loss])
- # Start input enqueue threads.
- coord = tf.train.Coordinator()
- threads = tf.train.start_queue_runners(sess=sess, coord=coord)
+ duration = time.time() - start_time
- try:
- step = 0
- while not coord.should_stop():
- start_time = time.time()
-
- # Run one step of the model. The return values are
- # the activations from the `train_op` (which is
- # discarded) and the `loss` op. To inspect the values
- # of your ops or variables, you may include them in
- # the list passed to sess.run() and the value tensors
- # will be returned in the tuple from the call.
- _, loss_value = sess.run([train_op, loss])
-
- duration = time.time() - start_time
-
- # Print an overview fairly often.
- if step % 100 == 0:
- print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
+ # Print an overview fairly often.
+ if step % 100 == 0:
+ print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
duration))
- step += 1
- except tf.errors.OutOfRangeError:
- print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
- finally:
- # When done, ask the threads to stop.
- coord.request_stop()
-
- # Wait for threads to finish.
- coord.join(threads)
- sess.close()
-
-
+ step += 1
+ except tf.errors.OutOfRangeError:
+ print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
+
def main(_):
run_training()
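For comparison with the queue-based pipeline removed above, here is a minimal, self-contained sketch of the tf.data pattern this file now uses. The file name 'train.tfrecords', the feature keys, and the batch/epoch values are illustrative assumptions, not part of the change itself.

```python
import tensorflow as tf

def decode(serialized_example):
  # Parse one serialized tf.Example into a flat uint8 image and an int32 label.
  features = tf.parse_single_example(
      serialized_example,
      features={
          'image_raw': tf.FixedLenFeature([], tf.string),
          'label': tf.FixedLenFeature([], tf.int64),
      })
  image = tf.decode_raw(features['image_raw'], tf.uint8)
  label = tf.cast(features['label'], tf.int32)
  return image, label

dataset = (tf.data.TFRecordDataset('train.tfrecords')  # or a [list, of, filenames]
           .repeat(2)       # number of epochs
           .map(decode)
           .shuffle(1000)   # shuffle buffer size
           .batch(32))
image_batch, label_batch = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
  try:
    while True:
      sess.run([image_batch, label_batch])
  except tf.errors.OutOfRangeError:
    pass  # all epochs consumed
```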
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
index 1e375ed..4a42983 100644
--- a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
@@ -53,7 +53,8 @@
// - Scales, clamps, and converts that spectrogram to 0 to 255 uint8's.
// - Reshapes the tensor so that it's [height, width, 1] for imaging.
// - Encodes it as a PNG stream and saves it out to a file.
- Output file_reader = ReadFile(root.WithOpName("input_wav"), input_wav);
+ Output file_reader =
+ tensorflow::ops::ReadFile(root.WithOpName("input_wav"), input_wav);
DecodeWav wav_decoder =
DecodeWav(root.WithOpName("wav_decoder"), file_reader);
Output spectrogram = AudioSpectrogram(root.WithOpName("spectrogram"),
@@ -71,8 +72,8 @@
Output squeeze = Squeeze(root.WithOpName("squeeze"), expand_dims,
Squeeze::Attrs().Axis({0}));
Output png_encoder = EncodePng(root.WithOpName("png_encoder"), squeeze);
- WriteFile file_writer =
- WriteFile(root.WithOpName("output_image"), output_image, png_encoder);
+ tensorflow::ops::WriteFile file_writer = tensorflow::ops::WriteFile(
+ root.WithOpName("output_image"), output_image, png_encoder);
tensorflow::GraphDef graph;
TF_RETURN_IF_ERROR(root.ToGraphDef(&graph));
diff --git a/tensorflow/go/graph.go b/tensorflow/go/graph.go
index f200a8e..fc087d9 100644
--- a/tensorflow/go/graph.go
+++ b/tensorflow/go/graph.go
@@ -28,7 +28,8 @@
// int num_shapes) {
// const int64_t** dims =
// (const int64_t**)malloc(sizeof(const int64_t*) * num_shapes);
-// for (int i = 0; i < num_shapes; i++) {
+// int i = 0;
+// for (i = 0; i < num_shapes; i++) {
// dims[i] = flat_dims;
// if (num_dims[i] > 0) {
// // flat_dims will be NULL iff num_shapes is 0 or all elements in num_dims are <= 0.
@@ -132,6 +133,20 @@
return &Operation{cop, g}
}
+// Operations returns a list of all operations in the graph
+func (g *Graph) Operations() []Operation {
+ var pos C.size_t = 0
+ ops := []Operation{}
+ for {
+ cop := C.TF_GraphNextOperation(g.c, &pos)
+ if cop == nil {
+ break
+ }
+ ops = append(ops, Operation{cop, g})
+ }
+ return ops
+}
+
// OpSpec is the specification of an Operation to be added to a Graph
// (using Graph.AddOperation).
type OpSpec struct {
diff --git a/tensorflow/go/graph_test.go b/tensorflow/go/graph_test.go
index c3120bc..b8d65c5 100644
--- a/tensorflow/go/graph_test.go
+++ b/tensorflow/go/graph_test.go
@@ -29,10 +29,26 @@
missing = append(missing, op)
}
}
- if len(missing) == 0 {
- return nil
+ if len(missing) != 0 {
+ return fmt.Errorf("Graph does not have the operations %v", missing)
}
- return fmt.Errorf("Graph does not have the operations %v", missing)
+
+ inList := map[string]bool{}
+ for _, op := range g.Operations() {
+ inList[op.Name()] = true
+ }
+
+ for _, op := range ops {
+ if !inList[op] {
+ missing = append(missing, op)
+ }
+ }
+
+ if len(missing) != 0 {
+ return fmt.Errorf("Operations %v are missing from graph.Operations()", missing)
+ }
+
+ return nil
}
func TestGraphWriteToAndImport(t *testing.T) {
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 017bef9..1481a4d 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -126,6 +126,12 @@
lambda feed: [feed])]
# pylint: enable=g-long-lambda
+
+def _convert_to_numpy_obj(numpy_dtype, obj):
+ """Explicitly convert obj based on numpy type except for string type."""
+ return numpy_dtype(obj) if numpy_dtype is not object else str(obj)
+
+
def register_session_run_conversion_functions(tensor_type, fetch_function,
feed_function=None, feed_function_for_partial_run=None):
"""Register fetch and feed conversion functions for `tf.Session.run()`.
@@ -1072,12 +1078,14 @@
'strings, lists, numpy ndarrays, or TensorHandles.')
subfeed_dtype = subfeed_t.dtype.as_numpy_dtype
- if isinstance(subfeed_val,
- int) and subfeed_dtype(subfeed_val) != subfeed_val:
+ if isinstance(subfeed_val, int) and _convert_to_numpy_obj(
+ subfeed_dtype, subfeed_val) != subfeed_val:
raise TypeError(
- 'Type of feed value ' + str(subfeed_val) + ' is not'
- ' compatible with Tensor type ' + str(subfeed_dtype) + '.'
- ' Try explicitly setting the type of the feed tensor'
+ 'Type of feed value ' + str(subfeed_val) + ' with type ' +
+ str(type(subfeed_val)) +
+ ' is not compatible with Tensor type ' +
+ str(subfeed_dtype) +
+ '. Try explicitly setting the type of the feed tensor'
' to a larger type (e.g. int64).')
is_tensor_handle_feed = isinstance(subfeed_val,
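To make the new feed check concrete, this is a small numpy-only sketch of what `_convert_to_numpy_obj` does for the comparison above; the surrounding Session plumbing is omitted and the example values are illustrative.

```python
import numpy as np

def _convert_to_numpy_obj(numpy_dtype, obj):
  # `object` is the numpy dtype backing tf.string tensors, so use str() there.
  return numpy_dtype(obj) if numpy_dtype is not object else str(obj)

# Feeding a Python int to an int32 tensor: the value round-trips, so it passes.
assert _convert_to_numpy_obj(np.int32, 7) == 7
# Feeding a Python int to a string tensor: str(1) != 1, so the TypeError above fires.
assert _convert_to_numpy_obj(object, 1) != 1
```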
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index 3da03a7..a563f5e 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -1737,6 +1737,12 @@
server = server_lib.Server.create_local_server()
self.runTestAddFunctionToSession(server.target)
+ def testAutoConvertAndCheckData(self):
+ with self.test_session() as sess:
+ a = array_ops.placeholder(dtype=dtypes.string)
+ with self.assertRaisesRegexp(
+ TypeError, 'Type of feed value 1 with type <(\w+) \'int\'> is not'):
+ sess.run(a, feed_dict={a: 1})
class GraphMutationTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 2315ad4..7897715 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -535,6 +535,7 @@
srcs_version = "PY2AND3",
tags = [
"no_windows",
+ "nomac",
"oss_serial",
],
deps = [
diff --git a/tensorflow/python/estimator/training_test.py b/tensorflow/python/estimator/training_test.py
index d72b95d..285671f 100644
--- a/tensorflow/python/estimator/training_test.py
+++ b/tensorflow/python/estimator/training_test.py
@@ -626,7 +626,7 @@
self._run_task(training._TrainingExecutor(mock_est, mock_train_spec,
mock_eval_spec))
- mock_est.train.assert_called()
+ self.assertTrue(mock_est.train.called)
mock_server.assert_not_called()
def test_fail_with_empty_task_type(self):
@@ -836,7 +836,7 @@
executor.run_master()
mock_server.assert_not_called()
- mock_est.train.assert_called()
+ self.assertTrue(mock_est.train.called)
def test_fail_with_empty_task_type(self):
mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
diff --git a/tensorflow/python/estimator/util.py b/tensorflow/python/estimator/util.py
index 12f2592..b31486d 100644
--- a/tensorflow/python/estimator/util.py
+++ b/tensorflow/python/estimator/util.py
@@ -52,7 +52,7 @@
else:
if _is_callable_object(fn):
fn = fn.__call__
- args = tf_inspect.getargspec(fn).args
+ args = tf_inspect.getfullargspec(fn).args
if _is_bounded_method(fn):
args.remove('self')
return tuple(args)
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 366025a..e06899f 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -82,8 +82,8 @@
return x + y, x - y
# Building the graph.
- a = tf.Constant([1.0])
- b = tf.Constant([2.0])
+ a = tf.constant([1.0])
+ b = tf.constant([2.0])
c, d = MyFunc(a, b, name='mycall')
```
"""
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
index c66b4b3..2e73cef 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
+++ b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
@@ -211,7 +211,7 @@
include_top: whether to include the fully-connected
layer at the top of the network.
weights: one of `None` (random initialization),
- "imagenet" (pre-training on ImageNet),
+ 'imagenet' (pre-training on ImageNet),
or the path to the weights file to be loaded.
input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
to use as image input for the model.
diff --git a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
index 4d5ac72..5f97c13 100644
--- a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
@@ -350,7 +350,7 @@
include_top: whether to include the fully-connected
layer at the top of the network.
weights: one of `None` (random initialization),
- "imagenet" (pre-training on ImageNet),
+ 'imagenet' (pre-training on ImageNet),
or the path to the weights file to be loaded.
input_tensor: optional Keras tensor (i.e. output of
`layers.Input()`)
@@ -536,6 +536,8 @@
if old_data_format:
K.set_image_data_format(old_data_format)
+ elif weights is not None:
+ model.load_weights(weights)
return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/resnet50.py b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
index f7cdf2b..8ab4669 100644
--- a/tensorflow/python/keras/_impl/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
@@ -164,7 +164,7 @@
include_top: whether to include the fully-connected
layer at the top of the network.
weights: one of `None` (random initialization),
- "imagenet" (pre-training on ImageNet),
+ 'imagenet' (pre-training on ImageNet),
or the path to the weights file to be loaded.
input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
to use as image input for the model.
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg16.py b/tensorflow/python/keras/_impl/keras/applications/vgg16.py
index ab205aa..38dbbdc 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg16.py
+++ b/tensorflow/python/keras/_impl/keras/applications/vgg16.py
@@ -70,8 +70,8 @@
Arguments:
include_top: whether to include the 3 fully-connected
layers at the top of the network.
- weights: one of `None` (random initialization),
- "imagenet" (pre-training on ImageNet),
+ weights: one of `None` (random initialization),
+ 'imagenet' (pre-training on ImageNet),
or the path to the weights file to be loaded.
input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
to use as image input for the model.
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg19.py b/tensorflow/python/keras/_impl/keras/applications/vgg19.py
index 5e5179f..126c642 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg19.py
+++ b/tensorflow/python/keras/_impl/keras/applications/vgg19.py
@@ -71,8 +71,8 @@
include_top: whether to include the 3 fully-connected
layers at the top of the network.
weights: one of `None` (random initialization),
- "imagenet" (pre-training on ImageNet),
- or the path to the weights file to be loaded.
+ 'imagenet' (pre-training on ImageNet),
+ or the path to the weights file to be loaded.
input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
to use as image input for the model.
input_shape: optional shape tuple, only to be specified
diff --git a/tensorflow/python/keras/_impl/keras/applications/xception.py b/tensorflow/python/keras/_impl/keras/applications/xception.py
index a9efd5d..8219831 100644
--- a/tensorflow/python/keras/_impl/keras/applications/xception.py
+++ b/tensorflow/python/keras/_impl/keras/applications/xception.py
@@ -83,7 +83,7 @@
include_top: whether to include the fully-connected
layer at the top of the network.
weights: one of `None` (random initialization),
- "imagenet" (pre-training on ImageNet),
+ 'imagenet' (pre-training on ImageNet),
or the path to the weights file to be loaded.
input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
to use as image input for the model.
@@ -303,6 +303,8 @@
if old_data_format:
K.set_image_data_format(old_data_format)
+ elif weights is not None:
+ model.load_weights(weights)
return model
diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py
index 712db33..6a74584 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core.py
@@ -104,13 +104,13 @@
"""
def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
- self.supports_masking = True
# Inheritance call order:
# 1) tf.layers.Dropout, 2) keras.layers.Layer, 3) tf.layers.Layer
super(Dropout, self).__init__(rate=rate,
noise_shape=noise_shape,
seed=seed,
**kwargs)
+ self.supports_masking = True
def call(self, inputs, training=None):
if training is None:
diff --git a/tensorflow/python/keras/_impl/keras/layers/core_test.py b/tensorflow/python/keras/_impl/keras/layers/core_test.py
index 1fe0435..bdb99c9 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core_test.py
@@ -47,6 +47,11 @@
'noise_shape': [3, 1]},
input_shape=(3, 2))
+ # https://github.com/tensorflow/tensorflow/issues/14819
+ with self.test_session():
+ dropout = keras.layers.Dropout(0.5)
+ self.assertEqual(True, dropout.supports_masking)
+
with self.test_session():
testing_utils.layer_test(
keras.layers.SpatialDropout1D,
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 1d8ca99..31d3bd1 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -2084,6 +2084,10 @@
"//tensorflow/python:framework_for_generated_wrappers",
],
shard_count = 2,
+ tags = [
+ "no_gpu",
+ "no_oss",
+ ],
)
cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/summary_image_op_test.py b/tensorflow/python/kernel_tests/summary_image_op_test.py
index d2152ab..4718827 100644
--- a/tensorflow/python/kernel_tests/summary_image_op_test.py
+++ b/tensorflow/python/kernel_tests/summary_image_op_test.py
@@ -50,7 +50,6 @@
self.assertProtoEquals(expected, image_summ)
def testImageSummary(self):
- np.random.seed(7)
for depth in (1, 3, 4):
for positive in False, True:
with self.test_session(graph=ops.Graph()) as sess:
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index b9c89d6..21561f3 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -1168,7 +1168,7 @@
set_random_seed for its interaction with the graph-level random seed.
Returns:
- 3-D float tensor of shape `[height, width, channels]`.
+ Adjusted image(s), same shape and DType as `image`.
Raises:
ValueError: if `max_delta` is invalid.
@@ -1275,30 +1275,9 @@
orig_dtype = image.dtype
flt_image = convert_image_dtype(image, dtypes.float32)
- # TODO(zhengxq): we will switch to the fused version after we add a GPU
- # kernel for that.
- fused = os.environ.get('TF_ADJUST_SATURATION_FUSED', '')
- fused = fused.lower() in ('true', 't', '1')
-
- if fused:
- return convert_image_dtype(
- gen_image_ops.adjust_saturation(flt_image, saturation_factor),
- orig_dtype)
-
- hsv = gen_image_ops.rgb_to_hsv(flt_image)
-
- hue = array_ops.slice(hsv, [0, 0, 0], [-1, -1, 1])
- saturation = array_ops.slice(hsv, [0, 0, 1], [-1, -1, 1])
- value = array_ops.slice(hsv, [0, 0, 2], [-1, -1, 1])
-
- saturation *= saturation_factor
- saturation = clip_ops.clip_by_value(saturation, 0.0, 1.0)
-
- hsv_altered = array_ops.concat([hue, saturation, value], 2)
- rgb_altered = gen_image_ops.hsv_to_rgb(hsv_altered)
-
- return convert_image_dtype(rgb_altered, orig_dtype)
-
+ return convert_image_dtype(
+ gen_image_ops.adjust_saturation(flt_image, saturation_factor),
+ orig_dtype)
def decode_image(contents, channels=None, name=None):
"""Convenience function for `decode_bmp`, `decode_gif`, `decode_jpeg`,
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index d1554b3..4af9bd2 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -281,6 +281,21 @@
y_tf = y.eval()
self.assertAllEqual(y_tf, y_np)
+ def testBatchAdjustHue(self):
+ x_shape = [2, 1, 2, 3]
+ x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+ x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+
+ delta = 0.25
+ y_data = [13, 0, 11, 226, 54, 221, 234, 8, 92, 1, 217, 255]
+ y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
+
+ with self.test_session(use_gpu=True):
+ x = constant_op.constant(x_np, shape=x_shape)
+ y = image_ops.adjust_hue(x, delta)
+ y_tf = y.eval()
+ self.assertAllEqual(y_tf, y_np)
+
def _adjustHueNp(self, x_np, delta_h):
self.assertEqual(x_np.shape[-1], 3)
x_v = x_np.reshape([-1, 3])
@@ -359,6 +374,89 @@
self._adjustHueTf(x_np, delta_h)
+class FlipImageBenchmark(test.Benchmark):
+
+ def _benchmarkFlipLeftRight(self, device, cpu_count):
+ image_shape = [299, 299, 3]
+ warmup_rounds = 100
+ benchmark_rounds = 1000
+ config = config_pb2.ConfigProto()
+ if cpu_count is not None:
+ config.inter_op_parallelism_threads = 1
+ config.intra_op_parallelism_threads = cpu_count
+ with session.Session("", graph=ops.Graph(), config=config) as sess:
+ with ops.device(device):
+ inputs = variables.Variable(
+ random_ops.random_uniform(
+ image_shape, dtype=dtypes.float32) * 255,
+ trainable=False,
+ dtype=dtypes.float32)
+ run_op = image_ops.flip_left_right(inputs)
+ sess.run(variables.global_variables_initializer())
+ for i in xrange(warmup_rounds + benchmark_rounds):
+ if i == warmup_rounds:
+ start = time.time()
+ sess.run(run_op)
+ end = time.time()
+ step_time = (end - start) / benchmark_rounds
+ tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
+ print("benchmarkFlipLeftRight_299_299_3_%s step_time: %.2f us" %
+ (tag, step_time * 1e6))
+ self.report_benchmark(
+ name="benchmarkFlipLeftRight_299_299_3_%s" % (tag),
+ iters=benchmark_rounds,
+ wall_time=step_time)
+
+ def _benchmarkRandomFlipLeftRight(self, device, cpu_count):
+ image_shape = [299, 299, 3]
+ warmup_rounds = 100
+ benchmark_rounds = 1000
+ config = config_pb2.ConfigProto()
+ if cpu_count is not None:
+ config.inter_op_parallelism_threads = 1
+ config.intra_op_parallelism_threads = cpu_count
+ with session.Session("", graph=ops.Graph(), config=config) as sess:
+ with ops.device(device):
+ inputs = variables.Variable(
+ random_ops.random_uniform(
+ image_shape, dtype=dtypes.float32) * 255,
+ trainable=False,
+ dtype=dtypes.float32)
+ run_op = image_ops.random_flip_left_right(inputs)
+ sess.run(variables.global_variables_initializer())
+ for i in xrange(warmup_rounds + benchmark_rounds):
+ if i == warmup_rounds:
+ start = time.time()
+ sess.run(run_op)
+ end = time.time()
+ step_time = (end - start) / benchmark_rounds
+ tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
+ print("benchmarkRandomFlipLeftRight_299_299_3_%s step_time: %.2f us" %
+ (tag, step_time * 1e6))
+ self.report_benchmark(
+ name="benchmarkRandomFlipLeftRight_299_299_3_%s" % (tag),
+ iters=benchmark_rounds,
+ wall_time=step_time)
+
+ def benchmarkFlipLeftRightCpu1(self):
+ self._benchmarkFlipLeftRight("/cpu:0", 1)
+
+ def benchmarkFlipLeftRightCpuAll(self):
+ self._benchmarkFlipLeftRight("/cpu:0", None)
+
+ def benchmarkFlipLeftRightGpu(self):
+ self._benchmarkFlipLeftRight(test.gpu_device_name(), None)
+
+ def benchmarkRandomFlipLeftRightCpu1(self):
+ self._benchmarkRandomFlipLeftRight("/cpu:0", 1)
+
+ def benchmarkRandomFlipLeftRightCpuAll(self):
+ self._benchmarkRandomFlipLeftRight("/cpu:0", None)
+
+ def benchmarkRandomFlipLeftRightGpu(self):
+ self._benchmarkRandomFlipLeftRight(test.gpu_device_name(), None)
+
+
class AdjustHueBenchmark(test.Benchmark):
def _benchmarkAdjustHue(self, device, cpu_count):
@@ -632,6 +730,21 @@
y_tf = y.eval()
self.assertAllEqual(y_tf, y_np)
+ def testBatchSaturation(self):
+ x_shape = [2, 1, 2, 3]
+ x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+ x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+
+ saturation_factor = 0.5
+ y_data = [6, 9, 13, 140, 180, 226, 135, 121, 234, 172, 255, 128]
+ y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
+
+ with self.test_session(use_gpu=True):
+ x = constant_op.constant(x_np, shape=x_shape)
+ y = image_ops.adjust_saturation(x, saturation_factor)
+ y_tf = y.eval()
+ self.assertAllEqual(y_tf, y_np)
+
def _adjust_saturation(self, image, saturation_factor):
image = ops.convert_to_tensor(image, name="image")
orig_dtype = image.dtype
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index 08e3f83..51ab2ae 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -39,8 +39,8 @@
name=None):
"""Prints a list of tensors.
- This is an identity op with the side effect of printing `data` when
- evaluating.
+ This is an identity op (behaves like `tf.identity`) with the side effect
+ of printing `data` when evaluating.
Note: This op prints to the standard error. It is not currently compatible
with jupyter notebook (printing to the notebook *server's* output, not into
@@ -57,7 +57,7 @@
name: A name for the operation (optional).
Returns:
- Same tensor as `input_`.
+ A `Tensor`. Has the same type and contents as `input_`.
"""
return gen_logging_ops._print(input_, data, message, first_n, summarize, name)
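A tiny usage sketch of the identity-op behavior described above; evaluating the returned tensor prints the `data` tensors to standard error.

```python
import tensorflow as tf

x = tf.constant([1.0, 2.0, 3.0])
# tf.Print returns a tensor with the same contents as x; the side effect is
# printing the tensors in `data` (here, the sum) whenever it is evaluated.
x = tf.Print(x, [tf.reduce_sum(x)], message='sum of x: ')

with tf.Session() as sess:
  sess.run(x)  # stderr: sum of x: [6]
```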
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 19a86df..fd96f7b 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -27,6 +27,7 @@
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import candidate_sampling_ops
from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import gen_array_ops
from tensorflow.python.ops import gen_nn_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
@@ -981,10 +982,11 @@
Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
name: A name for the operation (optional).
Returns:
- out_logits, out_labels: `Tensor` objects each with shape
+ out_logits: `Tensor` object with shape
`[batch_size, num_true + num_sampled]`, for passing to either
`nn.sigmoid_cross_entropy_with_logits` (NCE) or
`nn.softmax_cross_entropy_with_logits` (sampled softmax).
+ out_labels: A Tensor object with the same shape as `out_logits`.
"""
if isinstance(weights, variables.PartitionedVariable):
@@ -1095,15 +1097,16 @@
# Construct output logits and labels. The true labels/logits start at col 0.
out_logits = array_ops.concat([true_logits, sampled_logits], 1)
- # true_logits is a float tensor, ones_like(true_logits) is a float tensor
- # of ones. We then divide by num_true to ensure the per-example labels sum
- # to 1.0, i.e. form a proper probability distribution.
+
+ # true_logits is a float tensor, ones_like(true_logits) is a float
+ # tensor of ones. We then divide by num_true to ensure the per-example
+ # labels sum to 1.0, i.e. form a proper probability distribution.
out_labels = array_ops.concat([
array_ops.ones_like(true_logits) / num_true,
array_ops.zeros_like(sampled_logits)
], 1)
- return out_logits, out_labels
+ return out_logits, out_labels
def nce_loss(weights,
diff --git a/tensorflow/python/ops/quantized_conv_ops_test.py b/tensorflow/python/ops/quantized_conv_ops_test.py
index 5ea47ea..5e9e710 100644
--- a/tensorflow/python/ops/quantized_conv_ops_test.py
+++ b/tensorflow/python/ops/quantized_conv_ops_test.py
@@ -93,7 +93,7 @@
quantized_range = ((quantized_max - quantized_min) * range_adjust)
range_scale = (quantized_range / number_of_steps)
lowest_quantized = -(1 << (number_of_bits - 1))
- result = np.array([(quantized_min + ((x - lowest_quantized) * range_scale))
+ result = np.array([(quantized_min + ((float(x) - lowest_quantized) * range_scale))
for x in quantized.flatten()])
return result
diff --git a/tensorflow/python/ops/quantized_ops_test.py b/tensorflow/python/ops/quantized_ops_test.py
new file mode 100644
index 0000000..4bf3b35
--- /dev/null
+++ b/tensorflow/python/ops/quantized_ops_test.py
@@ -0,0 +1,57 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for quantized operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class QuantizedOpsTest(test.TestCase):
+
+ def __init__(self, method_name="runTest"):
+ super(QuantizedOpsTest, self).__init__(method_name)
+
+ def testQuantizeOp(self):
+ expected_output = [1, 1, 2, 127, 255, 255]
+ with self.test_session(use_gpu=False) as sess:
+ x = constant_op.constant([1.0, 1.25, 1.75, 127.0, 255.0, 500.0], shape=[6], dtype=dtypes.float32)
+ x_min = 0.0
+ x_max = 255.0
+ op = array_ops.quantize(x, x_min, x_max, dtypes.quint8, mode="MIN_FIRST")
+ value = sess.run(op)
+ self.assertArrayNear(expected_output, value.output, 0.1)
+
+ def testDequantizeOp(self):
+ expected_output = [1.0, 2.0, 4.0, 8.0, 16.0, 255.0]
+ inp = np.array([1, 2, 4, 8, 16, 255]).astype(np.uint8)
+ with self.test_session(use_gpu=False) as sess:
+ x = constant_op.constant(inp, shape=[6], dtype=dtypes.quint8)
+ x_min = 0.0
+ x_max = 255.0
+ op = array_ops.dequantize(x, x_min, x_max, mode="MIN_FIRST")
+ value = sess.run(op)
+ self.assertArrayNear(expected_output, value, 0.1)
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index 802b930..f0c28e7 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -362,7 +362,13 @@
The function returns the decayed learning rate. It is computed as:
```python
- decayed_learning_rate = learning_rate / (1 + decay_rate * t)
+  decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_steps)
+ ```
+
+ or, if `staircase` is `True`, as:
+
+ ```python
+  decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_steps))
```
Example: decay 1/t with a rate of 0.5:
@@ -371,8 +377,9 @@
...
global_step = tf.Variable(0, trainable=False)
learning_rate = 0.1
- k = 0.5
- learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, k)
+ decay_steps = 1.0
+ decay_rate = 0.5
+ learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate)
# Passing global_step to minimize() will increment it at each step.
learning_step = (
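A quick pure-Python check of the two formulas documented above; the parameter values are illustrative.

```python
import math

def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
                       staircase=False):
  t = global_step / float(decay_steps)
  if staircase:
    t = math.floor(t)
  return learning_rate / (1 + decay_rate * t)

print(inverse_time_decay(0.1, 10, 1.0, 0.5))                  # 0.1 / 6  ~= 0.0167
print(inverse_time_decay(0.1, 10, 4.0, 0.5, staircase=True))  # 0.1 / 2  == 0.05
```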
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 9ed1257..d14e710 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -45,6 +45,26 @@
if d.decorator_argspec is not None), _inspect.getargspec(target))
+def getfullargspec(obj): # pylint: disable=redefined-builtin
+ """TFDecorator-aware replacement for inspect.getfullargspec and fallback to
+ inspect.getargspec in Python 2.
+
+ Args:
+ obj: A callable, possibly decorated.
+
+ Returns:
+ The `FullArgSpec` (`ArgSpec` in Python 2) that describes the signature of
+ the outermost decorator that changes the callable's signature. If the
+ callable is not decorated, `inspect.getfullargspec()`
+ (`inspect.getargspec()` in Python 2) will be called directly on the
+ callable.
+ """
+ spec_fn = getattr(_inspect, 'getfullargspec', getattr(_inspect, 'getargspec'))
+ decorators, target = tf_decorator.unwrap(obj)
+ return next((d.decorator_argspec for d in decorators
+ if d.decorator_argspec is not None), spec_fn(target))
+
+
def getcallargs(func, *positional, **named):
"""TFDecorator-aware replacement for inspect.getcallargs.
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 0d2cd4a..73b96de 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -1132,7 +1132,7 @@
// space in order to speed up the convolution operation.
// algorithm: an integer to specify which algorithm should be used for the
// operation. kDefaultAlgorithm means the system will pick an algorithm
- // by default. The coding of the algorithm is be interpretted by the
+  // by default. The coding of the algorithm is interpreted by the
// underlying implementation.
// output_profile_result: the output profile result for this call. The
// profiling is only enabled when this is not nullptr.
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 611d50b..9b13a86 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -130,6 +130,13 @@
"//conditions:default": a,
})
+def if_windows(a):
+ return select({
+ clean_dep("//tensorflow:windows"): a,
+ clean_dep("//tensorflow:windows_msvc"): a,
+ "//conditions:default": [],
+ })
+
def if_linux_x86_64(a):
return select({
clean_dep("//tensorflow:linux_x86_64"): a,
@@ -1325,11 +1332,32 @@
"//conditions:default": [":" + cc_library_name],
}))
-def py_test(deps=[], **kwargs):
+# This macro is for running Python tests against the system-installed pip
+# package on Windows.
+#
+# py_test is built as an executable Python zip file on Windows, which contains all
+# dependencies of the target. Because of the C++ extensions, it would be very
+# inefficient if the py_test zipped all runfiles, and we don't need them when
+# running tests against the system-installed pip package. So we'd like to get
+# rid of the deps of py_test in this case.
+#
+# In order to trigger the tests without bazel clean after getting rid of deps,
+# we introduce the following:
+# 1. When --define=no_tensorflow_py_deps=true, the py_test depends on a marker
+#     file of the pip package, so the test is rerun when the pip package changes.
+# Note that this only works on Windows. See the definition of
+# //tensorflow/tools/pip_package:win_pip_package_marker for specific reasons.
+# 2. When --define=no_tensorflow_py_deps=false (by default), it's a normal py_test.
+def py_test(deps=[], data=[], **kwargs):
native.py_test(
deps=select({
"//conditions:default": deps,
- clean_dep("//tensorflow:no_tensorflow_py_deps"): []
+ clean_dep("//tensorflow:no_tensorflow_py_deps"): [],
+ }),
+ data = data + select({
+ "//conditions:default": [],
+ clean_dep("//tensorflow:no_tensorflow_py_deps"):
+ ["//tensorflow/tools/pip_package:win_pip_package_marker"],
}),
**kwargs)
@@ -1354,7 +1382,7 @@
additional_deps = additional_deps + tf_additional_xla_deps_py()
if grpc_enabled:
additional_deps = additional_deps + tf_additional_grpc_deps_py()
- native.py_test(
+ py_test(
name=name,
size=size,
srcs=srcs,
@@ -1364,13 +1392,10 @@
visibility=[clean_dep("//tensorflow:internal")],
shard_count=shard_count,
data=data,
- deps=select({
- "//conditions:default": [
- clean_dep("//tensorflow/python:extra_py_tests_deps"),
- clean_dep("//tensorflow/python:gradient_checker"),
+ deps=[
+ clean_dep("//tensorflow/python:extra_py_tests_deps"),
+ clean_dep("//tensorflow/python:gradient_checker"),
] + additional_deps,
- clean_dep("//tensorflow:no_tensorflow_py_deps"): []
- }),
flaky=flaky,
srcs_version="PY2AND3")
diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index 9809ad5..ecab6f8 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -530,7 +530,7 @@
}
// Capture overall inference time without stat logging overhead. This is the
- // timing data that can be compared to other libaries.
+ // timing data that can be compared to other libraries.
SleepSeconds(inter_benchmark_sleep_seconds);
int64 no_stat_time_us = 0;
int64 no_stat_num_runs = 0;
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu b/tensorflow/tools/ci_build/Dockerfile.gpu
index 2d46ccb..7591ecc 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu
@@ -1,8 +1,8 @@
-FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
LABEL maintainer="Jan Prach <jendap@google.com>"
-# In the Ubuntu 14.04 images, cudnn is placed in system paths. Move them to
+# In the Ubuntu 16.04 images, cudnn is placed in system paths. Move them to
# /usr/local/cuda
RUN cp -P /usr/include/cudnn.h /usr/local/cuda/include
RUN cp -P /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu_clang b/tensorflow/tools/ci_build/Dockerfile.gpu_clang
index 0ecd8c7..438a7ec 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu_clang
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu_clang
@@ -1,8 +1,8 @@
-FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
LABEL maintainer="Ilya Biryukov <ibiryukov@google.com>"
-# In the Ubuntu 14.04 images, cudnn is placed in system paths. Move them to
+# In the Ubuntu 16.04 images, cudnn is placed in system paths. Move them to
# /usr/local/cuda
RUN cp /usr/include/cudnn.h /usr/local/cuda/include
RUN cp /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index a37cf22..82042b9 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -296,19 +296,12 @@
die "FAILED to create virtualenv directory: ${VIRTUALENV_DIR}"
fi
- if [[ ${PYTHON_BIN_PATH} == *"python3.6"* ]]; then
- "${PYTHON_BIN_PATH}" -m venv "${VIRTUALENV_FLAGS}" \
- "${VIRTUALENV_DIR}" || \
- die "FAILED: Unable to create virtualenv"
- else
- # Verify that virtualenv exists
- if [[ -z $(which virtualenv) ]]; then
- die "FAILED: virtualenv not available on path"
- fi
- virtualenv ${VIRTUALENV_FLAGS} \
- -p "${PYTHON_BIN_PATH}" "${VIRTUALENV_DIR}" || \
- die "FAILED: Unable to create virtualenv"
- fi
+ # Use the virtualenv from the default python version (i.e., python-virtualenv)
+ # to create the virtualenv directory for testing. Use the -p flag to specify
+ # the python version inside the to-be-created virtualenv directory.
+ ${PYTHON_BIN_PATH} -m virtualenv -p "${PYTHON_BIN_PATH}" ${VIRTUALENV_FLAGS} \
+ "${VIRTUALENV_DIR}" || \
+ die "FAILED: Unable to create virtualenv"
source "${VIRTUALENV_DIR}/bin/activate" || \
die "FAILED: Unable to activate virtualenv in ${VIRTUALENV_DIR}"
@@ -350,7 +343,7 @@
then
echo "Smoke test of tensorflow install in clean virtualenv PASSED."
else
- echo "Smoke test of tensroflow install in clean virtualenv FAILED."
+ echo "Smoke test of tensorflow install in clean virtualenv FAILED."
return 1
fi
diff --git a/tensorflow/tools/ci_build/builds/print_build_info.sh b/tensorflow/tools/ci_build/builds/print_build_info.sh
index 7c43419..e366abf 100755
--- a/tensorflow/tools/ci_build/builds/print_build_info.sh
+++ b/tensorflow/tools/ci_build/builds/print_build_info.sh
@@ -88,7 +88,7 @@
# Print info
echo "TF_BUILD_INFO = {"\
"container_type: \"${CONTAINER_TYPE}\", "\
-"command: \"${COMMAND[@]}\", "\
+"command: \"${COMMAND[*]}\", "\
"source_HEAD: \"${TF_HEAD}\", "\
"source_remote_origin: \"${TF_FETCH_URL}\", "\
"OS: \"${OS}\", "\
diff --git a/tensorflow/tools/ci_build/builds/test_user_ops.sh b/tensorflow/tools/ci_build/builds/test_user_ops.sh
index 358f82a..caa3a40 100755
--- a/tensorflow/tools/ci_build/builds/test_user_ops.sh
+++ b/tensorflow/tools/ci_build/builds/test_user_ops.sh
@@ -82,11 +82,11 @@
TF_LFLAGS=( $("${PYTHON_BIN_PATH}" \
-c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )
-if [[ -z "${TF_CFLAGS}" || -z "${TF_LFLAGS}" ]]; then
+if [[ -z "${TF_CFLAGS[*]}" || -z "${TF_LFLAGS[*]}" ]]; then
die "FAILED to determine TensorFlow compilation or linking flags"
else
- echo "TensorFlow compile flags: ${TF_CFLAGS[@]}"
- echo "TensorFlow link flags: ${TF_LFLAGS[@]}"
+ echo "TensorFlow compile flags: ${TF_CFLAGS[*]}"
+ echo "TensorFlow link flags: ${TF_LFLAGS[*]}"
fi
# Check g++ availability
diff --git a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
index 6e7b752..cfeaebd 100755
--- a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
+++ b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
@@ -45,7 +45,7 @@
# This export only works within the brackets, so it is isolated to one
# single command.
export CUDA_VISIBLE_DEVICES=$i
- echo "Running test $@ on GPU $CUDA_VISIBLE_DEVICES"
+ echo "Running test $* on GPU $CUDA_VISIBLE_DEVICES"
$@
)
return_code=$?
diff --git a/tensorflow/tools/ci_build/install/install_deb_packages.sh b/tensorflow/tools/ci_build/install/install_deb_packages.sh
index 4ab307c..9640810 100755
--- a/tensorflow/tools/ci_build/install/install_deb_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_deb_packages.sh
@@ -48,6 +48,7 @@
git \
libcurl4-openssl-dev \
libtool \
+ libssl-dev \
mlocate \
openjdk-8-jdk \
openjdk-8-jre-headless \
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index b8ed1ab..da58ac2 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -27,6 +27,9 @@
pip2 install wheel
pip3 install wheel
+pip2 install virtualenv
+pip3 install virtualenv
+
# Install six.
pip2 install --upgrade six==1.10.0
pip3 install --upgrade six==1.10.0
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index 479242a..9881bd9 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -39,6 +39,8 @@
fi
set -e
+pip3.5 install --upgrade virtualenv
+
# Install six.
pip3.5 install --upgrade absl-py
pip3.5 install --upgrade six==1.10.0
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index ec7d9bf..1ca12c6 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -36,6 +36,8 @@
which pip3.6
ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3
+pip3 install --upgrade virtualenv
+
set -e
# Install six.
pip3 install --upgrade absl-py
diff --git a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
index df196f8..ac83e90 100755
--- a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
+++ b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
@@ -28,6 +28,8 @@
export PYTHON_BIN_PATH=`which python3`
export TF_NEED_CUDA=1
+export TF_CUDA_VERSION=8.0
+export TF_CUDNN_VERSION=6
export TF_CUDA_COMPUTE_CAPABILITIES=3.7
yes "" | $PYTHON_BIN_PATH configure.py
diff --git a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
index abd256a..6b80f44 100755
--- a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
@@ -28,6 +28,8 @@
export PYTHON_BIN_PATH=`which python3`
export TF_NEED_CUDA=1
+export TF_CUDA_VERSION=8.0
+export TF_CUDNN_VERSION=6
export TF_CUDA_COMPUTE_CAPABILITIES=3.7
yes "" | $PYTHON_BIN_PATH configure.py
diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
index 88116d9..1bd1852 100755
--- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -82,6 +82,7 @@
else
PI_COPTS='--copt=-march=armv7-a --copt=-mfpu=neon-vfpv4
--copt=-std=gnu11 --copt=-DS_IREAD=S_IRUSR --copt=-DS_IWRITE=S_IWUSR
+ --copt=-O3
--copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
--copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
--copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8'
diff --git a/tensorflow/tools/ci_build/remote/remote_docker_build.sh b/tensorflow/tools/ci_build/remote/remote_docker_build.sh
index 3ac6840..e00a66a 100755
--- a/tensorflow/tools/ci_build/remote/remote_docker_build.sh
+++ b/tensorflow/tools/ci_build/remote/remote_docker_build.sh
@@ -124,7 +124,7 @@
function publish_tf_image {
- $gcr_tf_image="gcr.io/tensorflow/${tf_image}"
+ gcr_tf_image="gcr.io/tensorflow/${tf_image}"
docker tag $tf_image $gcr_tf_image
gcloud docker -- push $gcr_tf_image
}
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 44b6d52..8d50250 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -96,10 +96,6 @@
exclude_gpu_cc_tests="${extra_failing_gpu_cc_tests} + ${exclude_cpu_cc_tests}"
-function clean_output_base() {
- bazel clean --expunge
-}
-
function run_configure_for_cpu_build {
# Due to a bug in Bazel: https://github.com/bazelbuild/bazel/issues/2182
# yes "" | ./configure doesn't work on Windows, so we set all the
@@ -115,7 +111,7 @@
export TF_NEED_MKL=0
fi
export TF_NEED_VERBS=0
- export TF_NEED_GCP=0
+ export TF_NEED_GCP=1
export TF_NEED_HDFS=0
export TF_NEED_OPENCL_SYCL=0
echo "" | ./configure
diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index 4a65369..f88e717 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -36,12 +36,6 @@
export PYTHON_BIN_PATH="C:/Program Files/Anaconda3/python.exe"
export PYTHON_LIB_PATH="C:/Program Files/Anaconda3/lib/site-packages"
-# Set Python path for cc_configure.bzl
-export BAZEL_PYTHON="C:/Program Files/Anaconda3/python.exe"
-
-# Set Visual Studio path
-export BAZEL_VS="C:/Program Files (x86)/Microsoft Visual Studio 14.0"
-
# Add python into PATH, it's needed because gen_git_source.py uses
# '/usr/bin/env python' as a shebang
export PATH="/c/Program Files/Anaconda3:$PATH"
@@ -53,13 +47,3 @@
export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/bin:$PATH"
export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/extras/CUPTI/libx64:$PATH"
export PATH="/c/tools/cuda/bin:$PATH"
-
-# Set the common build options on Windows
-export BUILD_OPTS='--config=monolithic --copt=-w --host_copt=-w --verbose_failures --experimental_ui'
-
-# Build TF with wrapper-less CROSSTOOL
-# TODO(pcloudy): Remove this after wrapper-less CROSSTOOL becomes default
-export NO_MSVC_WRAPPER=1
-
-export USE_DYNAMIC_CRT=1
-
diff --git a/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh b/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
index 8c41934..748a961 100644
--- a/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
@@ -42,8 +42,6 @@
source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
|| { echo "Failed to source bazel_test_lib.sh" >&2; exit 1; }
-clean_output_base
-
run_configure_for_cpu_build
# Compiling the following test is extremely slow with -c opt
@@ -54,5 +52,5 @@
# We need to strip \r so that the result can be stored in a variable under MSYS
tr '\r' ' ')
-bazel test $BUILD_OPTS -k $slow_compiling_test --test_output=errors
-bazel test -c opt $BUILD_OPTS -k $passing_tests --test_output=errors
+bazel test -k $slow_compiling_test --test_output=errors
+bazel test -c opt -k $passing_tests --test_output=errors
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 8520ca8..31b4226 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -44,9 +44,7 @@
run_configure_for_cpu_build
-clean_output_base
-
-bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
+bazel build -c opt tensorflow/tools/pip_package:build_pip_package || exit $?
# Create a python test directory to avoid package name conflict
PY_TEST_DIR="py_test_dir"
@@ -60,11 +58,8 @@
# Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
# which will result in testing the system-installed tensorflow
-# TODO(pcloudy): Remove TF_SAVER_LENIENT_NAMES after
-# https://github.com/tensorflow/tensorflow/issues/12844 is fixed.
-bazel test -c opt $BUILD_OPTS -k --test_output=errors \
+bazel test -c opt -k --test_output=errors \
--define=no_tensorflow_py_deps=true --test_lang_filters=py \
--test_tag_filters=-no_pip,-no_windows,-no_oss \
--build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
- --test_env=TF_SAVER_LENIENT_NAMES=True \
//${PY_TEST_DIR}/tensorflow/python/...
diff --git a/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh b/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh
index 3fd960d..f26f872 100644
--- a/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh
@@ -56,5 +56,5 @@
# TODO(pcloudy): There is a bug in Bazel preventing build with GPU support without -c opt
# Re-enable this test after it is fixed.
-# bazel test --config=win-cuda $BUILD_OPTS -k $slow_compiling_test --test_output=errors
-bazel test -c opt --config=win-cuda $BUILD_OPTS -k $passing_tests --test_output=errors
+# bazel test --config=win-cuda -k $slow_compiling_test --test_output=errors
+bazel test -c opt --config=win-cuda -k $passing_tests --test_output=errors
diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
index 47ca42d..922bb67 100644
--- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
@@ -44,9 +44,7 @@
run_configure_for_gpu_build
-clean_output_base
-
-bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
+bazel build -c opt tensorflow/tools/pip_package:build_pip_package || exit $?
# Create a python test directory to avoid package name conflict
PY_TEST_DIR="py_test_dir"
@@ -61,11 +59,8 @@
# Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
# which will result in testing the system-installed tensorflow
# GPU tests are very flaky when running concurrently, so set local_test_jobs=1
-# TODO(pcloudy): Remove TF_SAVER_LENIENT_NAMES after
-# https://github.com/tensorflow/tensorflow/issues/12844 is fixed.
-bazel test -c opt $BUILD_OPTS -k --test_output=errors \
+bazel test -c opt -k --test_output=errors \
--define=no_tensorflow_py_deps=true --test_lang_filters=py \
--test_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu,no_oss \
--build_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu,no_oss \
- --test_env=TF_SAVER_LENIENT_NAMES=True \
--local_test_jobs=1 --build_tests_only //${PY_TEST_DIR}/tensorflow/python/...
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
index 9ac3613..80f2b59 100755
--- a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
@@ -44,13 +44,12 @@
export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/java:libtensorflow_jni.so"
export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/tools/lib_package:jnilicenses_generate"
-clean_output_base
run_configure_for_cpu_build
# build_libtensorflow_tarball in ../builds/libtensorflow.sh
# cannot be used on Windows since it relies on pkg_tar rules.
# So we do something special here
-bazel build -c opt ${BUILD_OPTS} \
+bazel build -c opt \
tensorflow:libtensorflow.so \
tensorflow/tools/lib_package:clicenses_generate \
tensorflow/java:libtensorflow_jni.so \
diff --git a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
index a94a627..88333de 100755
--- a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
+++ b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
@@ -28,6 +28,8 @@
export PYTHON_BIN_PATH=`which python3`
export TF_NEED_CUDA=1
+export TF_CUDA_VERSION=8.0
+export TF_CUDNN_VERSION=6
export TF_CUDA_COMPUTE_CAPABILITIES=3.7
yes "" | $PYTHON_BIN_PATH configure.py
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 3525c75..0a6860e 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -69,11 +69,8 @@
rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
# Download and build TensorFlow.
-
-RUN git clone https://github.com/tensorflow/tensorflow.git && \
- cd tensorflow && \
- git checkout r1.4
WORKDIR /tensorflow
+RUN git clone --branch=r1.4 --depth=1 https://github.com/tensorflow/tensorflow.git .
# TODO(craigcitro): Don't install the pip package, since it makes it
# more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 041f459..4164cc3 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -1,11 +1,20 @@
-FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
+FROM nvidia/cuda:9.0-base-ubuntu16.04
LABEL maintainer="Craig Citro <craigcitro@google.com>"
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
+ cuda-command-line-tools-9-0 \
+ cuda-cublas-dev-9-0 \
+ cuda-cudart-dev-9-0 \
+ cuda-cufft-dev-9-0 \
+ cuda-curand-dev-9-0 \
+ cuda-cusolver-dev-9-0 \
+ cuda-cusparse-dev-9-0 \
curl \
git \
+ libcudnn7=7.0.5.15-1+cuda9.0 \
+ libcudnn7-dev=7.0.5.15-1+cuda9.0 \
libcurl3-dev \
libfreetype6-dev \
libpng12-dev \
@@ -17,12 +26,11 @@
unzip \
zip \
zlib1g-dev \
- openjdk-8-jdk \
- openjdk-8-jre-headless \
wget \
&& \
- apt-get clean && \
- rm -rf /var/lib/apt/lists/*
+ rm -rf /var/lib/apt/lists/* && \
+ find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+ rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
@@ -70,18 +78,16 @@
rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
# Download and build TensorFlow.
-
-RUN git clone https://github.com/tensorflow/tensorflow.git && \
- cd tensorflow && \
- git checkout r1.4
WORKDIR /tensorflow
+RUN git clone --branch=r1.4 --depth=1 https://github.com/tensorflow/tensorflow.git .
# Configure the build for our CUDA configuration.
ENV CI_BUILD_PYTHON python
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
ENV TF_NEED_CUDA 1
ENV TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1
-
+ENV TF_CUDA_VERSION=9.0
+ENV TF_CUDNN_VERSION=7
RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index e212d10..b6682cd 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:8.0-cudnn6-runtime-ubuntu16.04
+FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04
LABEL maintainer="Craig Citro <craigcitro@google.com>"
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index 80a07b9..e7de7df 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -265,7 +265,7 @@
DOCKERFILE="${TMP_DIR}/Dockerfile"
# Modify the devel Dockerfile to specify the git branch
- sed -r "s/([\s]*git checkout )(.*)/\1${TF_DOCKER_BUILD_DEVEL_BRANCH}/g" \
+ sed "s/^RUN git clone --branch=.* --depth=1/RUN git clone --branch=${TF_DOCKER_BUILD_DEVEL_BRANCH} --depth=1/" \
"${ORIG_DOCKERFILE}" > "${DOCKERFILE}"
# Modify python/pip version if necessary.
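
For illustration, the substitution performed by the new sed line above can be expressed in Python as follows; the helper name and the sample branch value ("r1.5") are hypothetical, only the pattern and replacement come from the script:

import re

def retarget_devel_branch(dockerfile_text, branch):
    # Rewrite the pinned --branch value on the devel Dockerfile's clone line,
    # mirroring the sed expression used by parameterized_docker_build.sh.
    return re.sub(r"^RUN git clone --branch=.* --depth=1",
                  "RUN git clone --branch=%s --depth=1" % branch,
                  dockerfile_text, flags=re.MULTILINE)

line = ("RUN git clone --branch=r1.4 --depth=1 "
        "https://github.com/tensorflow/tensorflow.git .")
print(retarget_devel_branch(line, "r1.5"))
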
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index f950f19..003f972 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -199,12 +199,12 @@
add_to[key] = add_from[key]
-# Exclude some libaries in contrib from the documentation altogether.
+# Exclude some libraries in contrib from the documentation altogether.
def _get_default_private_map():
return {'tf.test': ['mock']}
-# Exclude members of some libaries.
+# Exclude members of some libraries.
def _get_default_do_not_descend_map():
# TODO(wicke): Shrink this list once the modules get sealed.
return {
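
As a minimal sketch (helper name hypothetical), the default private map returned above can be consulted like this to decide whether a member is hidden from the generated docs:

def _get_default_private_map():
    # Same structure as the default above: module path -> members to hide.
    return {'tf.test': ['mock']}

def is_private(module_path, member, private_map=None):
    private_map = private_map or _get_default_private_map()
    return member in private_map.get(module_path, [])

assert is_private('tf.test', 'mock')
assert not is_private('tf.test', 'main')
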
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 33af453..d80d5ec 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -6,6 +6,7 @@
load(
"//tensorflow:tensorflow.bzl",
"if_not_windows",
+ "if_windows",
"transitive_hdrs",
)
load("//third_party/mkl:build_defs.bzl", "if_mkl")
@@ -194,3 +195,23 @@
],
}) + if_mkl(["//third_party/mkl:intel_binary_blob"]),
)
+
+# A genrule for generating a marker file for the pip package on Windows
+#
+# This only works on Windows, because :simple_console_for_windows is a
+# python zip file containing everything we need for building the pip package.
+# However, on other platforms, due to https://github.com/bazelbuild/bazel/issues/4223,
+# when C++ extensions change, this genrule doesn't get rebuilt.
+genrule(
+ name = "win_pip_package_marker",
+ srcs = if_windows([
+ ":build_pip_package",
+ ":simple_console_for_windows",
+ ]),
+ outs = ["win_pip_package_marker_file"],
+ cmd = select({
+ "//conditions:default": "touch $@",
+ "//tensorflow:windows": "md5sum $(locations :build_pip_package) $(locations :simple_console_for_windows) > $@",
+ }),
+ visibility = ["//visibility:public"],
+)
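
On Windows, the genrule above just writes the md5 digests of its inputs, so the marker file's contents change whenever :build_pip_package or :simple_console_for_windows changes. A minimal Python sketch of that behaviour, with placeholder paths standing in for the real $(locations ...) expansions:

import hashlib
import sys

def write_marker(marker_path, input_paths):
    # Equivalent of `md5sum input... > marker`: one digest line per input.
    with open(marker_path, "w") as marker:
        for path in input_paths:
            with open(path, "rb") as f:
                digest = hashlib.md5(f.read()).hexdigest()
            marker.write("%s  %s\n" % (digest, path))

if __name__ == "__main__":
    write_marker(sys.argv[1], sys.argv[2:])
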
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 8249703..f5203bc 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -24,7 +24,7 @@
function cp_external() {
local src_dir=$1
local dest_dir=$2
- for f in `find "$src_dir" -maxdepth 1 -mindepth 1 ! -name '*local_config_cuda*'`; do
+ for f in `find "$src_dir" -maxdepth 1 -mindepth 1 ! -name '*local_config_cuda*' ! -name '*org_tensorflow*'`; do
cp -R "$f" "$dest_dir"
done
}
@@ -92,7 +92,6 @@
bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles/org_tensorflow/tensorflow \
"${TMPDIR}"
mkdir "${TMPDIR}/external"
- # Note: this makes an extra copy of org_tensorflow.
cp_external \
bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles \
"${TMPDIR}/external"
@@ -123,7 +122,6 @@
bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/tensorflow \
"${TMPDIR}"
mkdir "${TMPDIR}/external"
- # Note: this makes an extra copy of org_tensorflow.
cp_external \
bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles \
"${TMPDIR}/external"
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index cc46dd5..22e1584 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -42,6 +42,7 @@
"//tensorflow/python:extra_py_tests_deps",
"//tensorflow/cc/saved_model:saved_model_half_plus_two",
"//tensorflow:no_tensorflow_py_deps",
+ "//tensorflow/tools/pip_package:win_pip_package_marker",
"//tensorflow/python:test_ops_2",
"//tensorflow/python:tf_optimizer",
"//tensorflow/python:compare_test_proto_py",
diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions.cc b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
index ecb29a6..f0bb59a 100644
--- a/tensorflow/tools/proto_text/gen_proto_text_functions.cc
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
@@ -132,6 +132,7 @@
FILE* f = fopen(path.c_str(), "w");
if (f == nullptr) return -1;
if (fwrite(data.c_str(), 1, data.size(), f) != data.size()) {
+ fclose(f);
return -1;
}
if (fclose(f) != 0) {
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index b71f5dc..046c2b2 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -74,11 +74,11 @@
tf_http_archive(
name = "mkl_dnn",
urls = [
- "https://mirror.bazel.build/github.com/01org/mkl-dnn/archive/b01e3a55a07be62172e713bcd2644c5176360212.tar.gz",
- "https://github.com/01org/mkl-dnn/archive/b01e3a55a07be62172e713bcd2644c5176360212.tar.gz",
+ "https://mirror.bazel.build/github.com/01org/mkl-dnn/archive/aab753280e83137ba955f8f19d72cb6aaba545ef.tar.gz",
+ "https://github.com/01org/mkl-dnn/archive/aab753280e83137ba955f8f19d72cb6aaba545ef.tar.gz",
],
- sha256 = "0d529ad4c49dc799e6df07c2b88b115d0668735da15fb3b3862d28d33fa68165",
- strip_prefix = "mkl-dnn-b01e3a55a07be62172e713bcd2644c5176360212",
+ sha256 = "fb67f255a96bd4ad39b8dd104eca5aa92200c95c1ed36e59641e6c0478eefd11",
+ strip_prefix = "mkl-dnn-aab753280e83137ba955f8f19d72cb6aaba545ef",
build_file = str(Label("//third_party/mkl_dnn:mkldnn.BUILD")),
)
@@ -95,11 +95,11 @@
tf_http_archive(
name = "eigen_archive",
urls = [
- "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/429aa5254200.tar.gz",
- "https://bitbucket.org/eigen/eigen/get/429aa5254200.tar.gz",
+ "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/b6e6d0cf6a77.tar.gz",
+ "https://bitbucket.org/eigen/eigen/get/b6e6d0cf6a77.tar.gz",
],
- sha256 = "61d8b6fc4279dd1dda986fb1677d15e3d641c07a3ea5abe255790b1f0c0c14e9",
- strip_prefix = "eigen-eigen-429aa5254200",
+ sha256 = "0840c497f2749b5e90bda666aab96be6da90dc75b4e21ca9843cae69b7fed52a",
+ strip_prefix = "eigen-eigen-b6e6d0cf6a77",
build_file = str(Label("//third_party:eigen.BUILD")),
)
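
When bumping archive pins like the mkl_dnn and eigen_archive entries above, the new sha256 value can be recomputed with a small helper along these lines (the helper itself is not part of the repository; the URL is just the eigen mirror already listed above):

import hashlib
import urllib.request

def archive_sha256(url):
    # Download the archive and return the hex sha256 to paste into workspace.bzl.
    with urllib.request.urlopen(url) as response:
        return hashlib.sha256(response.read()).hexdigest()

if __name__ == "__main__":
    print(archive_sha256(
        "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/"
        "b6e6d0cf6a77.tar.gz"))
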
diff --git a/third_party/curl.BUILD b/third_party/curl.BUILD
index e311c7e..4def6f9 100644
--- a/third_party/curl.BUILD
+++ b/third_party/curl.BUILD
@@ -10,6 +10,7 @@
"/DHAVE_CONFIG_H",
"/DCURL_DISABLE_FTP",
"/DCURL_DISABLE_NTLM",
+ "/DCURL_DISABLE_PROXY",
"/DHAVE_LIBZ",
"/DHAVE_ZLIB_H",
# Defining _USING_V110_SDK71_ is hackery to defeat curl's incorrect
@@ -23,6 +24,8 @@
"lib/asyn-thread.c",
"lib/inet_ntop.c",
"lib/system_win32.c",
+ "lib/vtls/schannel.c",
+ "lib/idn_win32.c",
]
cc_library(
@@ -276,6 +279,7 @@
"-DCURL_MAX_WRITE_SIZE=65536",
],
}),
+ defines = ["CURL_STATICLIB"],
includes = ["include"],
linkopts = select({
"@org_tensorflow//tensorflow:android": [
@@ -289,10 +293,16 @@
],
"@org_tensorflow//tensorflow:ios": [],
"@org_tensorflow//tensorflow:windows": [
- "-Wl,ws2_32.lib",
+ "-DEFAULTLIB:ws2_32.lib",
+ "-DEFAULTLIB:advapi32.lib",
+ "-DEFAULTLIB:crypt32.lib",
+ "-DEFAULTLIB:Normaliz.lib",
],
"@org_tensorflow//tensorflow:windows_msvc": [
- "-Wl,ws2_32.lib",
+ "-DEFAULTLIB:ws2_32.lib",
+ "-DEFAULTLIB:advapi32.lib",
+ "-DEFAULTLIB:crypt32.lib",
+ "-DEFAULTLIB:Normaliz.lib",
],
"//conditions:default": [
"-lrt",
@@ -438,12 +448,22 @@
"# include \"lib/config-win32.h\"",
"# define BUILDING_LIBCURL 1",
"# define CURL_DISABLE_CRYPTO_AUTH 1",
+ "# define CURL_DISABLE_DICT 1",
+ "# define CURL_DISABLE_FILE 1",
+ "# define CURL_DISABLE_GOPHER 1",
"# define CURL_DISABLE_IMAP 1",
"# define CURL_DISABLE_LDAP 1",
"# define CURL_DISABLE_LDAPS 1",
"# define CURL_DISABLE_POP3 1",
"# define CURL_PULL_WS2TCPIP_H 1",
- "# define HTTP_ONLY 1",
+ "# define CURL_DISABLE_SMTP 1",
+ "# define CURL_DISABLE_TELNET 1",
+ "# define CURL_DISABLE_TFTP 1",
+ "# define CURL_PULL_WS2TCPIP_H 1",
+ "# define USE_WINDOWS_SSPI 1",
+ "# define USE_WIN32_IDN 1",
+ "# define USE_SCHANNEL 1",
+ "# define WANT_IDN_PROTOTYPES 1",
"#elif defined(__APPLE__)",
"# define HAVE_FSETXATTR_6 1",
"# define HAVE_SETMODE 1",
diff --git a/third_party/pcre.BUILD b/third_party/pcre.BUILD
index 68aadd1..e2cdec4 100644
--- a/third_party/pcre.BUILD
+++ b/third_party/pcre.BUILD
@@ -50,12 +50,12 @@
"-DNEWLINE=10",
"-DNO_RECURSE",
"-DPARENS_NEST_LIMIT=50",
- "-DPCRE_STATIC=1",
"-DPOSIX_MALLOC_THRESHOLD=10",
"-DSTDC_HEADERS=1",
"-DSUPPORT_UCP",
"-DSUPPORT_UTF",
],
+ defines = ["PCRE_STATIC=1"],
includes = ["."],
visibility = ["@swig//:__pkg__"], # Please use RE2
alwayslink = 1,