Merge changes from GitHub.
Change: 137532946
diff --git a/README.md b/README.md
index 49b1983..1372f20 100644
--- a/README.md
+++ b/README.md
@@ -33,10 +33,10 @@
 
 People who are a little more adventurous can also try our nightly binaries:
 
-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
-* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
-* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
+* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
+* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
+* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac1-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac1-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac1-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac1-slave/))
+* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
 * [Android](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/lastSuccessfulBuild/artifact/bazel-out/local_linux/bin/tensorflow/examples/android/tensorflow_demo.apk) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/))
 
 #### *Try your first TensorFlow program*
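For context, the program that section walks through is a couple of lines of Python. A minimal smoke test against the 0.11-era API, sketched here:

```python
import tensorflow as tf

hello = tf.constant('Hello, TensorFlow!')
sess = tf.Session()
print(sess.run(hello))  # Hello, TensorFlow!

a = tf.constant(10)
b = tf.constant(32)
print(sess.run(a + b))  # 42
```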
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 7d94f8a..a7dd248 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -15,6 +15,7 @@
 
 # Options
 option(tensorflow_VERBOSE "Enable for verbose output" OFF)
+option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
 option(tensorflow_ENABLE_SSL_SUPPORT "Enable boringssl support" OFF)
 option(tensorflow_ENABLE_GRPC_SUPPORT "Enable gRPC support" ON)
 option(tensorflow_BUILD_CC_EXAMPLE "Build the C++ tutorial example" ON)
@@ -48,8 +49,13 @@
 add_definitions(-DEIGEN_AVOID_STL_ARRAY)
 if(WIN32)
   add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC -D__VERSION__=\"MSVC\")
+  add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
+  add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH -D_ITERATOR_DEBUG_LEVEL=0)
+  add_definitions(/bigobj /nologo /EHsc /GF /FC /MP /Gm-)
   # Suppress warnings to reduce build log size.
   add_definitions(/wd4267 /wd4244 /wd4800 /wd4503 /wd4554 /wd4996 /wd4348 /wd4018)
+  add_definitions(/wd4099 /wd4146 /wd4305 /wd4307)
+  add_definitions(/wd4715 /wd4722 /wd4723 /wd4838 /wd4309 /wd4334)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
 endif()
 
@@ -80,7 +86,16 @@
     ${protobuf_STATIC_LIBRARIES}
 )
 set(tensorflow_EXTERNAL_DEPENDENCIES
-  gif_copy_headers_to_destination png_copy_headers_to_destination jpeg_copy_headers_to_destination jsoncpp farmhash_copy_headers_to_destination highwayhash_copy_headers_to_destination protobuf eigen)
+    zlib_copy_headers_to_destination
+    gif_copy_headers_to_destination
+    png_copy_headers_to_destination
+    jpeg_copy_headers_to_destination
+    jsoncpp
+    farmhash_copy_headers_to_destination
+    highwayhash_copy_headers_to_destination
+    protobuf
+    eigen
+)
 
 include_directories(
     # Source and generated code.
@@ -118,19 +133,67 @@
   list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS})
 endif()
 
+if (tensorflow_ENABLE_GPU)
+  if (WIN32)
+    find_package(CUDA 8.0 REQUIRED)
+
+    # By default we assume compute capabilities 3.5 and 5.2. If you change this,
+    # also change it in CUDA_NVCC_FLAGS and cuda_config.h below.
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
+    set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
+    include_directories(${CUDA_INCLUDE})
+    add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.5,5.2)
+
+    # add cudnn
+    include_directories(${CUDNN_HOME})
+    set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDNN_HOME}/lib/x64/cudnn.lib)
+
+    # create cuda_config.h
+    FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
+      "#ifndef CUDA_CUDA_CONFIG_H_\n"
+      "#define CUDA_CUDA_CONFIG_H_\n"
+      "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
+      "#define TF_CUDA_VERSION \"64_80\"\n"
+      "#define TF_CUDNN_VERSION \"64_5\"\n"
+      "#endif  // CUDA_CUDA_CONFIG_H_\n"
+    )
+
+    # TensorFlow assumes in various places that header files live in cuda/include. On Windows
+    # the CUDA SDK installs them under cuda/<version>/include, so to avoid changing TensorFlow
+    # we copy a few files to cuda/include.
+    FILE(COPY
+      ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda.h ${CUDA_TOOLKIT_TARGET_DIR}/include/cuComplex.h
+      ${CUDA_TOOLKIT_TARGET_DIR}/include/cublas_v2.h ${CUDNN_HOME}/include/cudnn.h
+      ${CUDA_TOOLKIT_TARGET_DIR}/include/cufft.h ${CUDA_TOOLKIT_TARGET_DIR}/include/curand.h
+      DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
+    )
+    include_directories(${tensorflow_source_dir}/third_party/gpus)
+    # add cuda libraries to tensorflow_EXTERNAL_LIBRARIES
+    list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
+  endif()
+endif()
+
 # Let's get to work!
 include(tf_core_framework.cmake)
 include(tf_tools.cmake)
 # NOTE: Disabled until issue #3996 is fixed.
 # include(tf_stream_executor.cmake)
+if (tensorflow_ENABLE_GPU)
+  if (WIN32)
+    include(tf_stream_executor.cmake)
+  endif()
+endif()
+
 include(tf_core_cpu.cmake)
 include(tf_models.cmake)
 include(tf_core_ops.cmake)
 include(tf_core_direct_session.cmake)
+include(tf_core_kernels.cmake)
 if(tensorflow_ENABLE_GRPC_SUPPORT)
   include(tf_core_distributed_runtime.cmake)
 endif()
-include(tf_core_kernels.cmake)
+
 include(tf_cc_ops.cmake)
 if(tensorflow_BUILD_CC_EXAMPLE)
   include(tf_tutorials.cmake)
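Once a build configured with `-Dtensorflow_ENABLE_GPU=ON` succeeds, a quick way to confirm the runtime actually sees the device is a check along these lines (a sketch; `tf.test.is_gpu_available` is the same helper the new tests in this change rely on):

```python
import tensorflow as tf

# Expected to be True for a GPU-enabled build on a CUDA-capable machine.
print(tf.test.is_gpu_available())

with tf.device('/gpu:0'):
    c = tf.matmul(tf.ones([2, 2]), tf.ones([2, 2]))

# log_device_placement prints which device each op was assigned to.
with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    print(sess.run(c))
```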
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index daf5101..a3510b5 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -15,14 +15,13 @@
 
 The CMake files in this directory can build the core TensorFlow runtime, an
 example C++ binary, and a PIP package containing the runtime and Python
-bindings. Currently, only CPU builds are supported, but we are working on
-providing a GPU build as well.
+bindings.
 
 Note: Windows support is in an **alpha** state, and we welcome your feedback.
 
 ### Pre-requisites
 
-* CMake version 3.1 or later
+* CMake version 3.1 up to 3.6
 
 * [Git](http://git-scm.com)
 
@@ -45,21 +44,13 @@
   - [Anaconda 4.1.1 (Python 3.5 64-bit)](https://www.continuum.io/downloads)
   - [Git for Windows version 2.9.2.windows.1](https://git-scm.com/download/win)
   - [swigwin-3.0.10](http://www.swig.org/download.html)
-
+  - [NVidia CUDA Toolkit 8.0](https://developer.nvidia.com/cuda-downloads)
+  - [NVidia CUDNN 5.1](https://developer.nvidia.com/cudnn)
 * Ubuntu 14.04
   - Makefile generator
   - Docker 1.9.1 (for automated testing)
 
 ### Current known limitations
-
-* CPU support only
-
-  - We are in the process of porting the GPU code in
-    `tensorflow/stream_executor` to build with CMake and work on non-POSIX
-    platforms.
-
-* Additional limitations for the Windows build:
-
   - The Python package supports **Python 3.5 only**, because that is the only
     version for which standard Python binaries exist and those binaries are
     compatible with the TensorFlow runtime. (On Windows, the standard Python
@@ -114,6 +105,17 @@
      D:\temp> "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\amd64\vcvarsall.bat"
      ```
 
+   * When building with GPU support, after installing the CUDNN zip file from NVidia, append its
+     bin directory to your PATH environment variable.
+     If TensorFlow fails to find the CUDA DLLs during initialization, check your PATH environment
+     variable: it should contain the directory of the CUDA DLLs and the directory of the CUDNN DLL.
+     For example:
+     
+     ```
+     D:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin
+     D:\local\cuda\bin
+     ```
+
    * We assume that `cmake` and `git` are installed and in your `%PATH%`. If
      for example `cmake` is not in your path and it is installed in
      `C:\Program Files (x86)\CMake\bin\cmake.exe`, you can add this directory
@@ -145,9 +147,14 @@
    D:\...\build> cmake .. -A x64 -DCMAKE_BUILD_TYPE=Release ^
    More? -DSWIG_EXECUTABLE=C:/tools/swigwin-3.0.10/swig.exe ^
    More? -DPYTHON_EXECUTABLE=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/python.exe ^
-   More? -DPYTHON_LIBRARIES=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/libs/python35.lib
+   More? -DPYTHON_LIBRARIES=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/libs/python35.lib 
    ```
-
+   To build with GPU support, add "^" at the end of the last line above and append the following:
+   ```
+   More? -Dtensorflow_ENABLE_GPU=ON ^
+   More? -DCUDNN_HOME="D:\...\cudnn"
+   ```
+    
    Note that the `-DCMAKE_BUILD_TYPE=Release` flag must match the build
    configuration that you choose when invoking `msbuild`. The known-good
    values are `Release` and `RelWithDebInfo`. The `Debug` build type is
@@ -184,6 +191,11 @@
      SSL support (for making secure HTTP requests) in the TensorFlow runtime.
      This support is incomplete, and will be used for Google Cloud Storage
      support.
+     
+   * `-Dtensorflow_ENABLE_GPU=(ON|OFF)`. Defaults to `OFF`. Include
+     GPU support. If GPU is enabled, you need to install the CUDA 8.0 Toolkit and CUDNN 5.1.
+     CMake expects the location of CUDNN to be passed as `-DCUDNN_HOME=path_you_unzipped_cudnn`.
+    
 
 4. Invoke MSBuild to build TensorFlow.
 
@@ -202,7 +214,6 @@
    D:\...\build> MSBuild /p:Configuration=Release tf_python_build_pip_package.vcxproj
    ```
 
-
 Linux Continuous Integration build
 ==================================
 
diff --git a/tensorflow/contrib/cmake/setup.py b/tensorflow/contrib/cmake/setup.py
index bd1dade..1edc173 100644
--- a/tensorflow/contrib/cmake/setup.py
+++ b/tensorflow/contrib/cmake/setup.py
@@ -26,7 +26,7 @@
 from setuptools.command.install import install as InstallCommandBase
 from setuptools.dist import Distribution
 
-_VERSION = '0.11.0rc0-cmake-experimental'
+_VERSION = '0.11.0rc1-cmake-experimental'
 
 REQUIRED_PACKAGES = [
     'numpy >= 1.11.0',
diff --git a/tensorflow/contrib/cmake/tf_core_cpu.cmake b/tensorflow/contrib/cmake/tf_core_cpu.cmake
index 143f2e7..f850c40 100644
--- a/tensorflow/contrib/cmake/tf_core_cpu.cmake
+++ b/tensorflow/contrib/cmake/tf_core_cpu.cmake
@@ -21,13 +21,27 @@
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_factory.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_options.cc"
 )
-
 list(REMOVE_ITEM tf_core_cpu_srcs ${tf_core_cpu_exclude_srcs}) 
+
 # We need to include stubs for the GPU tracer, which are in the exclude glob.
 list(APPEND tf_core_cpu_srcs
      "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_tracer.cc"
      "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_tracer.h"
 )
 
+if (tensorflow_ENABLE_GPU)
+  file(GLOB_RECURSE tf_core_gpu_srcs
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/*.cc"
+    "${tensorflow_source_dir}/tensorflow/core/platform/default/gpu/cupti_wrapper.cc"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu_device_factory.cc"
+  )
+  file(GLOB_RECURSE tf_core_gpu_exclude_srcs
+     "${tensorflow_source_dir}/tensorflow/core/*test*.cc"
+     "${tensorflow_source_dir}/tensorflow/core/*test*.cc"
+  )
+  list(REMOVE_ITEM tf_core_gpu_srcs ${tf_core_gpu_exclude_srcs})
+  list(APPEND tf_core_cpu_srcs ${tf_core_gpu_srcs})
+endif()
+
 add_library(tf_core_cpu OBJECT ${tf_core_cpu_srcs})
 add_dependencies(tf_core_cpu tf_core_framework)
diff --git a/tensorflow/contrib/cmake/tf_core_distributed_runtime.cmake b/tensorflow/contrib/cmake/tf_core_distributed_runtime.cmake
index cf41e92..b3c06d2 100644
--- a/tensorflow/contrib/cmake/tf_core_distributed_runtime.cmake
+++ b/tensorflow/contrib/cmake/tf_core_distributed_runtime.cmake
@@ -38,9 +38,11 @@
     $<TARGET_OBJECTS:tf_core_ops>
     $<TARGET_OBJECTS:tf_core_direct_session>
     $<TARGET_OBJECTS:tf_core_distributed_runtime>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
 )
 
 target_link_libraries(grpc_tensorflow_server PUBLIC
     tf_protos_cc
+    ${tf_core_gpu_kernels_lib}
     ${tensorflow_EXTERNAL_LIBRARIES}
 )
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index 6927ecf..19b57f0 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -38,6 +38,7 @@
       "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/metrics/kernels/set_kernels.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/metrics/ops/set_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc"
@@ -83,7 +84,7 @@
 
 if(WIN32)
   file(GLOB_RECURSE tf_core_kernels_windows_exclude_srcs
-      # Not currently working on Windows:
+      # Not working on Windows yet:
       "${tensorflow_source_dir}/tensorflow/core/kernels/depthwise_conv_op.cc"  # Cannot find symbol: tensorflow::LaunchConv2DOp<struct Eigen::ThreadPoolDevice, double>::launch(...).
       "${tensorflow_source_dir}/tensorflow/core/kernels/fact_op.cc"
       "${tensorflow_source_dir}/tensorflow/core/kernels/immutable_constant_op.cc"
@@ -93,14 +94,38 @@
       "${tensorflow_source_dir}/tensorflow/core/kernels/sparse_matmul_op.h"
       "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h"
       "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc"
+      "${tensorflow_source_dir}/tensorflow/core/kernels/svd*.cc"
+      "${tensorflow_source_dir}/tensorflow/core/kernels/avgpooling_op.*"
   )
   list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_windows_exclude_srcs})
 endif(WIN32)
 
+file(GLOB_RECURSE tf_core_gpu_kernels_srcs
+   "${tensorflow_source_dir}/tensorflow/core/kernels/*.cu.cc"
+   "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/*.cu.cc"
+)
+
+if(WIN32)
+  file(GLOB_RECURSE tf_core_gpu_kernels_exclude_srcs
+      # Not working on Windows yet:
+      "${tensorflow_source_dir}/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc"
+  )
+  list(REMOVE_ITEM tf_core_gpu_kernels_srcs ${tf_core_gpu_kernels_exclude_srcs})
+endif(WIN32)
+
 add_library(tf_core_kernels OBJECT ${tf_core_kernels_srcs})
+add_dependencies(tf_core_kernels tf_core_cpu)
 
 if(WIN32)
   target_compile_options(tf_core_kernels PRIVATE /MP)
+  if (tensorflow_ENABLE_GPU)
+    set_source_files_properties(${tf_core_gpu_kernels_srcs} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
+    set(tf_core_gpu_kernels_lib tf_core_gpu_kernels)
+    cuda_add_library(${tf_core_gpu_kernels_lib} ${tf_core_gpu_kernels_srcs})
+    set_target_properties(${tf_core_gpu_kernels_lib}
+                          PROPERTIES DEBUG_POSTFIX ""
+                          COMPILE_FLAGS "${TF_REGULAR_CXX_FLAGS}"
+    )
+    add_dependencies(${tf_core_gpu_kernels_lib} tf_core_cpu)
+  endif()
 endif()
-
-add_dependencies(tf_core_kernels tf_core_cpu)
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index d1029d3..8cdecf7 100644
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -302,12 +302,14 @@
     $<TARGET_OBJECTS:tf_core_direct_session>
     $<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
     $<TARGET_OBJECTS:tf_core_kernels>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
 )
 target_include_directories(pywrap_tensorflow PUBLIC
     ${PYTHON_INCLUDE_DIR}
     ${NUMPY_INCLUDE_DIR}
 )
 target_link_libraries(pywrap_tensorflow
+    ${tf_core_gpu_kernels_lib}
     ${tensorflow_EXTERNAL_LIBRARIES}
     tf_protos_cc
     ${PYTHON_LIBRARIES}
diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake
index b121ddf..bf45bb0 100644
--- a/tensorflow/contrib/cmake/tf_stream_executor.cmake
+++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake
@@ -47,11 +47,17 @@
     "${tensorflow_source_dir}/tensorflow/stream_executor/platform/default/*.h"
 )
 
+if (tensorflow_ENABLE_GPU)    
+    file(GLOB tf_stream_executor_gpu_srcs
+        "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*.cc"
+    )
+    list(APPEND tf_stream_executor_srcs ${tf_stream_executor_gpu_srcs})
+endif()    
+
 #file(GLOB_RECURSE tf_stream_executor_test_srcs
 #    "${tensorflow_source_dir}/tensorflow/stream_executor/*_test.cc"
 #    "${tensorflow_source_dir}/tensorflow/stream_executor/*_test.h"
 #)
-#
 #list(REMOVE_ITEM tf_stream_executor_srcs ${tf_stream_executor_test_srcs}) 
 
 add_library(tf_stream_executor OBJECT ${tf_stream_executor_srcs})
diff --git a/tensorflow/contrib/cmake/tf_tutorials.cmake b/tensorflow/contrib/cmake/tf_tutorials.cmake
index 8a23d02..d6547d6 100644
--- a/tensorflow/contrib/cmake/tf_tutorials.cmake
+++ b/tensorflow/contrib/cmake/tf_tutorials.cmake
@@ -12,9 +12,11 @@
     $<TARGET_OBJECTS:tf_cc_ops>
     $<TARGET_OBJECTS:tf_core_ops>
     $<TARGET_OBJECTS:tf_core_direct_session>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
 )
 
 target_link_libraries(tf_tutorials_example_trainer PUBLIC
     tf_protos_cc
+    ${tf_core_gpu_kernels_lib}
     ${tensorflow_EXTERNAL_LIBRARIES}
 )
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index f07e8a3..7f19d42 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -942,6 +942,7 @@
     kernel_size,
     stride=1,
     padding='SAME',
+    data_format=DATA_FORMAT_NHWC,
     activation_fn=nn.relu,
     normalizer_fn=None,
     normalizer_params=None,
@@ -961,7 +962,9 @@
   second variable called 'biases' is added to the result of the operation.
 
   Args:
-    inputs: a tensor of size [batch_size, height, width, channels].
+    inputs: A 4-D `Tensor` of type `float` and shape
+      `[batch, height, width, in_channels]` for `NHWC` data format or
+      `[batch, in_channels, height, width]` for `NCHW` data format.
     num_outputs: integer, the number of output filters.
     kernel_size: a list of length 2 holding the [kernel_height, kernel_width]
       of the filters. Can be an int if both values are the same.
@@ -969,6 +972,7 @@
       Can be an int if both strides are the same.  Note that presently
       both strides must have the same value.
     padding: one of 'VALID' or 'SAME'.
+    data_format: A string. `NHWC` (default) and `NCHW` are supported.
     activation_fn: activation function, set to None to skip it and maintain
       a linear activation.
     normalizer_fn: normalization function to use instead of `biases`. If
@@ -993,14 +997,23 @@
 
   Raises:
     ValueError: if 'kernel_size' is not a list of length 2.
+    ValueError: if `data_format` is neither `NHWC` nor `NCHW`.
+    ValueError: if `C` dimension of `inputs` is None.
   """
   with variable_scope.variable_scope(
       scope, 'Conv2d_transpose', [inputs], reuse=reuse) as sc:
+    if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
+      raise ValueError('data_format has to be either NCHW or NHWC.')
     dtype = inputs.dtype.base_dtype
     kernel_h, kernel_w = utils.two_element_tuple(kernel_size)
     stride_h, stride_w = utils.two_element_tuple(stride)
-    num_filters_in = utils.last_dimension(
-        inputs.get_shape(), min_rank=4)
+    if data_format == DATA_FORMAT_NCHW:
+      c_axis, h_axis, w_axis = 1, 2, 3
+    else:
+      h_axis, w_axis, c_axis = 1, 2, 3
+    num_filters_in = inputs.get_shape()[c_axis].value
+    if num_filters_in is None:
+      raise ValueError('`C` dimension of `inputs` must be known but is None.')
     weights_shape = [kernel_h, kernel_w, num_outputs, num_filters_in]
     weights_collections = utils.get_variable_collections(
         variables_collections, 'weights')
@@ -1015,7 +1028,7 @@
 
     inputs_shape = array_ops.shape(inputs)
     batch_size = inputs_shape[0]
-    height, width = inputs_shape[1], inputs_shape[2]
+    height, width = inputs_shape[h_axis], inputs_shape[w_axis]
 
     def get_deconv_dim(dim_size, stride_size, kernel_size, padding):
       if isinstance(dim_size, ops.Tensor):
@@ -1031,17 +1044,25 @@
     out_height = get_deconv_dim(height, stride_h, kernel_h, padding)
     out_width = get_deconv_dim(width, stride_w, kernel_w, padding)
 
-    output_shape = array_ops.pack(
-        [batch_size, out_height, out_width, num_outputs])
+    if data_format == DATA_FORMAT_NHWC:
+      output_shape = [batch_size, out_height, out_width, num_outputs]
+      strides = [1, stride_h, stride_w, 1]
+    else:
+      output_shape = [batch_size, num_outputs, out_height, out_width]
+      strides = [1, 1, stride_h, stride_w]
+
+    output_shape = array_ops.pack(output_shape)
     outputs = nn.conv2d_transpose(inputs, weights, output_shape,
-                                  [1, stride_h, stride_w, 1],
-                                  padding=padding)
+                                  strides,
+                                  padding=padding,
+                                  data_format=data_format)
 
     # Infer the static output shape:
     out_shape = inputs.get_shape().as_list()
-    out_shape[-1] = num_outputs
-    out_shape[1] = get_deconv_dim(out_shape[1], stride_h, kernel_h, padding)
-    out_shape[2] = get_deconv_dim(out_shape[2], stride_w, kernel_w, padding)
+    out_shape[c_axis] = num_outputs
+    out_shape[h_axis] = get_deconv_dim(out_shape[h_axis], stride_h, kernel_h, padding)
+    out_shape[w_axis] = get_deconv_dim(out_shape[w_axis], stride_w, kernel_w, padding)
     outputs.set_shape(out_shape)
 
     if normalizer_fn is not None:
@@ -1057,7 +1078,7 @@
                                           initializer=biases_initializer,
                                           regularizer=biases_regularizer,
                                           collections=biases_collections)
-        outputs = nn.bias_add(outputs, biases)
+        outputs = nn.bias_add(outputs, biases, data_format=data_format)
 
     if activation_fn is not None:
       outputs = activation_fn(outputs)
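With the `data_format` argument threaded through, calling the layer on `NCHW` inputs would look roughly like this (a sketch mirroring the new tests below; `NCHW` is GPU-only):

```python
import tensorflow as tf

if tf.test.is_gpu_available():
    with tf.device('/gpu:0'):
        # [batch, channels, height, width] rather than the default NHWC layout.
        images = tf.random_uniform([5, 3, 10, 12], seed=1)
        output = tf.contrib.layers.conv2d_transpose(
            images, num_outputs=32, kernel_size=[3, 3], stride=2,
            padding='SAME', data_format='NCHW')
        # The static shape is now inferred along the right axes: [5, 32, 20, 24].
```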
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 0ee6eb1..ff0d0a2 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -588,6 +588,175 @@
 
 class Convolution2dTransposeTests(tf.test.TestCase):
 
+  def testInvalidDataFormat(self):
+    height, width = 7, 9
+    with self.test_session():
+      images = tf.random_uniform((5, height, width, 3), seed=1)
+      with self.assertRaisesRegexp(
+          ValueError, 'data_format has to be either NCHW or NHWC.'):
+        tf.contrib.layers.convolution2d_transpose(
+            images, 32, 3, data_format='CHWN')
+
+  def testOutputSizeWithStrideOneSamePaddingNCHW(self):
+    # The `NCHW` data format is only supported on GPU devices.
+    if tf.test.is_gpu_available():
+      with self.test_session(use_gpu=True) as sess:
+        num_filters = 32
+        input_size = [5, 3, 10, 12]
+        expected_size = [5, num_filters, 10, 12]
+
+        images = tf.random_uniform(input_size, seed=1)
+        output = tf.contrib.layers.conv2d_transpose(
+            images, num_filters, [3, 3], stride=1,
+            padding='SAME', data_format='NCHW')
+        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
+
+        sess.run(tf.initialize_all_variables())
+        self.assertListEqual(list(output.eval().shape), expected_size)
+
+  def testOutputSizeWithStrideOneValidPaddingNCHW(self):
+    if tf.test.is_gpu_available():
+      with self.test_session(use_gpu=True) as sess:
+        num_filters = 32
+        input_size = [5, 3, 10, 12]
+        expected_size = [5, num_filters, 12, 14]
+
+        images = tf.random_uniform(input_size, seed=1)
+        output = tf.contrib.layers.conv2d_transpose(
+            images, num_filters, [3, 3], stride=1,
+            padding='VALID', data_format='NCHW')
+        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
+
+        sess.run(tf.initialize_all_variables())
+        self.assertListEqual(list(output.eval().shape), expected_size)
+
+  def testOutputSizeWithStrideTwoValidPaddingNCHW(self):
+    if tf.test.is_gpu_available():
+      with self.test_session(use_gpu=True) as sess:
+        num_filters = 32
+        input_size = [5, 3, 9, 11]
+        expected_size = [5, num_filters, 19, 23]
+
+        images = tf.random_uniform(input_size, seed=1)
+        output = tf.contrib.layers.conv2d_transpose(
+            images, num_filters, [3, 3], stride=[2, 2],
+            padding='VALID', data_format='NCHW')
+        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
+        self.assertListEqual(list(output.get_shape().as_list()), expected_size)
+
+        sess.run(tf.initialize_all_variables())
+        self.assertListEqual(list(output.eval().shape), expected_size)
+
+  def testOutputSizeWith1x1StrideTwoSamePaddingNCHW(self):
+    if tf.test.is_gpu_available():
+      with self.test_session(use_gpu=True) as sess:
+        num_filters = 1
+        input_size = [1, 1, 1, 1]
+        expected_size = [1, num_filters, 2, 2]
+
+        images = tf.random_uniform(input_size, seed=1)
+        output = tf.contrib.layers.conv2d_transpose(
+            images, num_filters, [2, 2], stride=[2, 2],
+            padding='SAME', data_format='NCHW')
+        self.assertListEqual(list(output.get_shape().as_list()), expected_size)
+
+        sess.run(tf.initialize_all_variables())
+        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
+        self.assertListEqual(list(output.eval().shape), expected_size)
+
+  def testOutputSizeWith1x1StrideTwoValidPaddingNCHW(self):
+    if tf.test.is_gpu_available():
+      with self.test_session(use_gpu=True) as sess:
+        num_filters = 1
+        input_size = [1, 1, 1, 1]
+        expected_size = [1, num_filters, 2, 2]
+
+        images = tf.random_uniform(input_size, seed=1)
+        output = tf.contrib.layers.conv2d_transpose(
+            images, num_filters, [2, 2], stride=[2, 2],
+            padding='VALID', data_format='NCHW')
+        sess.run(tf.initialize_all_variables())
+        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
+        self.assertListEqual(list(output.eval().shape), expected_size)
+
+  def testOutputSizeWith2x2StrideTwoSamePaddingNCHW(self):
+    if tf.test.is_gpu_available():
+      with self.test_session(use_gpu=True) as sess:
+        num_filters = 1
+        input_size = [1, 1, 2, 2]
+        expected_size = [1, num_filters, 4, 4]
+
+        images = tf.random_uniform(input_size, seed=1)
+        output = tf.contrib.layers.conv2d_transpose(
+            images, num_filters, [2, 2], stride=[2, 2],
+            padding='SAME', data_format='NCHW')
+        sess.run(tf.initialize_all_variables())
+        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
+        self.assertListEqual(list(output.eval().shape), expected_size)
+
+  def testOutputSizeWith2x2StrideTwoValidPaddingNCHW(self):
+    if tf.test.is_gpu_available():
+      with self.test_session(use_gpu=True) as sess:
+        num_filters = 1
+        input_size = [1, 1, 2, 2]
+        expected_size = [1, num_filters, 4, 4]
+
+        images = tf.random_uniform(input_size, seed=1)
+        output = tf.contrib.layers.conv2d_transpose(
+            images, num_filters, [2, 2], stride=[2, 2],
+            padding='VALID', data_format='NCHW')
+        sess.run(tf.initialize_all_variables())
+        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
+        self.assertListEqual(list(output.eval().shape), expected_size)
+
+  def testOutputSizeWithStride2x1NCHW(self):
+    if tf.test.is_gpu_available():
+      with self.test_session(use_gpu=True) as sess:
+        num_filters = 1
+        input_size = [1, 1, 3, 2]
+        expected_size = [1, num_filters, 6, 5]
+
+        images = tf.random_uniform(input_size, seed=1)
+        output = tf.contrib.layers.conv2d_transpose(
+            images, num_filters, [2, 4], stride=[2, 1],
+            padding='VALID', data_format='NCHW')
+        sess.run(tf.initialize_all_variables())
+        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
+        self.assertListEqual(list(output.eval().shape), expected_size)
+
+  def testOutputSizeWithStride2x4NCHW(self):
+    if tf.test.is_gpu_available():
+      with self.test_session(use_gpu=True) as sess:
+        num_filters = 1
+        input_size = [1, 1, 3, 2]
+        expected_size = [1, num_filters, 6, 8]
+
+        images = tf.random_uniform(input_size, seed=1)
+        output = tf.contrib.layers.conv2d_transpose(
+            images, num_filters, [2, 4], stride=[2, 4],
+            padding='VALID', data_format='NCHW')
+        sess.run(tf.initialize_all_variables())
+        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
+        self.assertListEqual(list(output.eval().shape), expected_size)
+
+  def testOutputSizeWithStride2x5NCHW(self):
+    if tf.test.is_gpu_available():
+      with self.test_session(use_gpu=True) as sess:
+        num_filters = 1
+        input_size = [1, 1, 3, 2]
+        expected_size = [1, num_filters, 6, 10]
+
+        images = tf.random_uniform(input_size, seed=1)
+        output = tf.contrib.layers.conv2d_transpose(
+            images, num_filters, [2, 4], stride=[2, 5],
+            padding='VALID', data_format='NCHW')
+        sess.run(tf.initialize_all_variables())
+        self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
+        self.assertListEqual(list(output.eval().shape), expected_size)
+
   def testOutputSizeWithStrideOneSamePadding(self):
     num_filters = 32
     input_size = [5, 10, 12, 3]
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
index 2924fd6..8491bb7 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
@@ -244,7 +244,7 @@
       session.run(tf.initialize_local_variables())
 
       coord = tf.train.Coordinator()
-      tf.train.start_queue_runners(session, coord=coord)
+      threads = tf.train.start_queue_runners(session, coord=coord)
 
       self.assertAllEqual(session.run(inputs), [b"ABC"])
       self.assertAllEqual(session.run(inputs), [b"DEF"])
@@ -253,6 +253,7 @@
         session.run(inputs)
 
       coord.request_stop()
+      coord.join(threads)
 
   def test_read_keyed_batch_features_mutual_exclusive_args(self):
     filename = self._create_temp_file("abcde")
@@ -307,6 +308,7 @@
         coord.request_stop()
 
       coord.join(threads)
+
     parsed_records = [item for sublist in [d["sequence"] for d in data]
                       for item in sublist]
     # Check that the number of records matches expected and all records
@@ -331,7 +333,7 @@
       session.run(tf.initialize_local_variables())
 
       coord = tf.train.Coordinator()
-      tf.train.start_queue_runners(session, coord=coord)
+      threads = tf.train.start_queue_runners(session, coord=coord)
 
       self.assertEqual("%s:1" % name, inputs.name)
       file_name_queue_name = "%s/file_name_queue" % name
@@ -352,6 +354,7 @@
         session.run(inputs)
 
       coord.request_stop()
+      coord.join(threads)
 
   def test_read_text_lines_multifile_with_shared_queue(self):
     gfile.Glob = self._orig_glob
@@ -375,7 +378,7 @@
       session.run(tf.initialize_local_variables())
 
       coord = tf.train.Coordinator()
-      tf.train.start_queue_runners(session, coord=coord)
+      threads = tf.train.start_queue_runners(session, coord=coord)
 
       self.assertEqual("%s:1" % name, inputs.name)
       shared_file_name_queue_name = "%s/file_name_queue" % name
@@ -398,6 +401,7 @@
         session.run(inputs)
 
       coord.request_stop()
+      coord.join(threads)
 
   def _get_qr(self, name):
     for qr in ops.get_collection(ops.GraphKeys.QUEUE_RUNNERS):
@@ -490,7 +494,7 @@
       session.run(tf.initialize_local_variables())
 
       coord = tf.train.Coordinator()
-      tf.train.start_queue_runners(session, coord=coord)
+      threads = tf.train.start_queue_runners(session, coord=coord)
 
       self.assertAllEqual(session.run(inputs), [b"A", b"B", b"C"])
       self.assertAllEqual(session.run(inputs), [b"D", b"E"])
@@ -498,6 +502,7 @@
         session.run(inputs)
 
       coord.request_stop()
+      coord.join(threads)
 
   def test_keyed_read_text_lines(self):
     gfile.Glob = self._orig_glob
@@ -517,7 +522,7 @@
       session.run(tf.initialize_local_variables())
 
       coord = tf.train.Coordinator()
-      tf.train.start_queue_runners(session, coord=coord)
+      threads = tf.train.start_queue_runners(session, coord=coord)
 
       self.assertAllEqual(session.run([keys, inputs]),
                           [[filename.encode("utf-8") + b":1"], [b"ABC"]])
@@ -529,6 +534,7 @@
         session.run(inputs)
 
       coord.request_stop()
+      coord.join(threads)
 
   def test_keyed_parse_json(self):
     gfile.Glob = self._orig_glob
@@ -557,7 +563,7 @@
       session.run(tf.initialize_local_variables())
 
       coord = tf.train.Coordinator()
-      tf.train.start_queue_runners(session, coord=coord)
+      threads = tf.train.start_queue_runners(session, coord=coord)
 
       key, age = session.run([keys, inputs["age"]])
       self.assertAllEqual(age, [[0]])
@@ -572,6 +578,7 @@
         session.run(inputs)
 
       coord.request_stop()
+      coord.join(threads)
 
 
 if __name__ == "__main__":
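The recurring fix in this file is one shutdown pattern: keep the thread list returned by `start_queue_runners` and join it after `request_stop`, so no queue-runner thread outlives the test. In isolation the pattern is (a sketch against the 0.11 API):

```python
import tensorflow as tf

queue = tf.train.string_input_producer(["a", "b", "c"], num_epochs=1)
item = queue.dequeue()

with tf.Session() as sess:
    # num_epochs is backed by a local variable, hence the local init.
    sess.run(tf.initialize_local_variables())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess, coord=coord)
    try:
        while True:
            print(sess.run(item))
    except tf.errors.OutOfRangeError:
        pass
    finally:
        coord.request_stop()
        coord.join(threads)  # wait for the runner threads to exit cleanly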
diff --git a/tensorflow/contrib/learn/python/learn/utils/export_test.py b/tensorflow/contrib/learn/python/learn/utils/export_test.py
index 0f1c7e6..329a486 100644
--- a/tensorflow/contrib/learn/python/learn/utils/export_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/export_test.py
@@ -21,6 +21,7 @@
 
 import os
 import random
+import six
 import tempfile
 
 import numpy as np
@@ -63,8 +64,8 @@
     # Only the written checkpoints are exported.
     self.assertTrue(tf.gfile.Exists(export_dir + '00000001/export'))
     self.assertTrue(tf.gfile.Exists(export_dir + '00000010/export'))
-    self.assertEquals(export_monitor.last_export_dir, os.path.join(export_dir,
-                                                                   '00000010'))
+    self.assertEquals(export_monitor.last_export_dir,
+                      six.b(os.path.join(export_dir, '00000010')))
     # Validate the signature
     signature = self._get_default_signature(export_dir + '00000010/export.meta')
     self.assertTrue(signature.HasField('regression_signature'))
@@ -86,8 +87,8 @@
     # Only the written checkpoints are exported.
     self.assertTrue(tf.gfile.Exists(export_dir + '00000001/export'))
     self.assertTrue(tf.gfile.Exists(export_dir + '00000010/export'))
-    self.assertEquals(export_monitor.last_export_dir, os.path.join(export_dir,
-                                                                   '00000010'))
+    self.assertEquals(export_monitor.last_export_dir,
+                      six.b(os.path.join(export_dir, '00000010')))
     # Validate the signature
     signature = self._get_default_signature(export_dir + '00000010/export.meta')
     self.assertTrue(signature.HasField('generic_signature'))
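The `six.b` wrapping is needed because the updated assertion implies `last_export_dir` comes back as `bytes`, while `os.path.join` returns `str` under Python 3; converting the expected value keeps the comparison valid on both interpreters. A quick illustration:

```python
import os
import six

path = os.path.join('/tmp/export', '00000010')
print(type(path))   # <class 'str'> on Python 3
print(six.b(path))  # b'/tmp/export/00000010', comparable to the stored bytes value
```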
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index c13f67f..0b528cb 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -351,6 +351,10 @@
   inline int Log2FloorNonZero(uint64 n) {
 #if defined(__GNUC__)
     return 63 ^ __builtin_clzll(n);
+#elif defined(PLATFORM_WINDOWS)
+    unsigned long index;
+    _BitScanReverse64(&index, n);
+    return index;
 #else
     int r = 0;
     while (n > 0) {
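All three branches of `Log2FloorNonZero` compute floor(log2(n)) for n > 0: on GCC, `63 ^ __builtin_clzll(n)` equals `63 - clz(n)` because `clz` is at most 63 for nonzero input, and the new MSVC branch gets the same bit index from `_BitScanReverse64`. A reference version to check against (a sketch):

```python
def log2_floor_nonzero(n):
    # floor(log2(n)) for n > 0, the value all three C++ branches return.
    assert n > 0
    return n.bit_length() - 1

assert log2_floor_nonzero(1) == 0
assert log2_floor_nonzero(8) == 3
assert log2_floor_nonzero(9) == 3
assert log2_floor_nonzero(2**63) == 63
```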
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 30f1a28..e9c48a3 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -873,7 +873,9 @@
   if (visible_device_list.empty()) {
     visible_gpu_order.resize(gpu_manager->VisibleDeviceCount());
     // By default, visible to virtual mapping is unchanged.
-    std::iota(visible_gpu_order.begin(), visible_gpu_order.end(), 0);
+    int deviceNo = 0;
+    std::generate(visible_gpu_order.begin(), visible_gpu_order.end(),
+                  [&deviceNo] { return deviceNo++; });
   } else {
     std::vector<string> order_str = str_util::Split(visible_device_list, ',');
     for (int i = 0; i < order_str.size(); ++i) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_tracer.cc b/tensorflow/core/common_runtime/gpu/gpu_tracer.cc
index 82d8b71..ee93b19 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_tracer.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_tracer.cc
@@ -254,6 +254,10 @@
   return manager;
 }
 
+#ifdef _MSC_VER
+#define __thread __declspec(thread) 
+#endif
+
 // TODO(pbar) Move this to platform specific header file?
 // Static thread local variable for POD types.
 #define TF_STATIC_THREAD_LOCAL_POD(_Type_, _var_)                  \
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.cc b/tensorflow/core/common_runtime/gpu/pool_allocator.cc
index e0362b3..700ac34 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator.cc
@@ -16,8 +16,10 @@
 #include "tensorflow/core/common_runtime/gpu/pool_allocator.h"
 
 #include <errno.h>
+#ifndef _MSC_VER
 #include <strings.h>
 #include <sys/mman.h>  // for munmap
+#endif
 
 #include <map>
 #include <utility>
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index d7e72df..ea9b42f 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -126,7 +126,7 @@
     gpu::StreamExecutor* se =
         gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
     int bus_id = se->GetDeviceDescription().numa_node();
-    if (bus_id < static_cast<int64>(gpu_visitors_.size())) {
+    if (bus_id >= 0 && bus_id < static_cast<int64>(gpu_visitors_.size())) {
       for (auto v : gpu_visitors_[bus_id]) {
         gpu_allocators_[gpu_id]->AddAllocVisitor(v);
       }
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 35e009c..4f8eb04 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -152,7 +152,7 @@
   // allocated by this allocator.
   virtual size_t RequestedSize(void* ptr) {
     CHECK(false) << "allocator doesn't track sizes";
-    return 0;
+    return size_t(0);
   }
 
   // Returns the allocated size of the buffer at 'ptr' if known,
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index acba116..8f0075d 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -149,6 +149,7 @@
   // attributes requested.  See allocator.h for more details.
   virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) {
     LOG(FATAL) << "GetAllocator() is not implemented.";
+    return nullptr;
   }
 
   // Return the Allocator implementation to use based on the allocator
@@ -180,6 +181,8 @@
 
   virtual const DeviceAttributes& attributes() const {
     LOG(FATAL) << "Device does not implement attributes()";
+    static DeviceAttributes dummy;
+    return dummy;
   }
 
   // Materializes the given TensorProto into 'tensor' stored in Device
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index 0776a1c..4d8d378 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -349,6 +349,15 @@
 
 TEST(Tensor_Scalar, Basics) {
   {
+    Tensor t(DT_BOOL, TensorShape({}));
+    EXPECT_EQ(1, t.NumElements());
+    auto Tt = t.scalar<bool>();
+    EXPECT_EQ(1, Tt.size());
+    EXPECT_EQ(0, Tt.rank());
+    t.scalar<bool>()() = true;
+    EXPECT_TRUE(Tt());
+  }
+  {
     Tensor t(DT_FLOAT, TensorShape({}));
     EXPECT_EQ(1, t.NumElements());
     auto Tt = t.scalar<float>();
diff --git a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
index d71fdac..a54dbdf 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
@@ -16,6 +16,7 @@
 #if GOOGLE_CUDA
 
 #include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 namespace functor {
@@ -32,6 +33,28 @@
 };
 
 template <typename T>
+struct SelectScalarFunctor<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+                  typename TTypes<bool>::ConstScalar cond,
+                  typename TTypes<T>::ConstFlat then_flat,
+                  typename TTypes<T>::ConstFlat else_flat) {
+#if !defined(EIGEN_HAS_INDEX_LIST)
+    Eigen::array<int, 1> rank1{1};
+#else
+    Eigen::IndexList<Eigen::type2index<1>> rank1;
+#endif
+    const int size = then_flat.dimension(0);
+    Eigen::array<int, 1> broadcast_dims{size};
+
+    To32Bit(out).device(d) = cond.reshape(rank1)
+                                 .broadcast(broadcast_dims)
+                                 .select(then_flat, else_flat);
+  }
+};
+
+template <typename T>
 struct BatchSelectFunctor<GPUDevice, T> {
   void operator()(const GPUDevice& d,
                   typename TTypes<T>::Matrix output_flat_outer_dims,
@@ -68,6 +91,7 @@
 
 #define SELECT_FUNCTOR(T)                      \
   template struct SelectFunctor<GPUDevice, T>; \
+  template struct SelectScalarFunctor<GPUDevice, T>; \
   template struct BatchSelectFunctor<GPUDevice, T>;
 
 SELECT_FUNCTOR(Eigen::half);
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index fbfde88..8160fb7 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -41,6 +41,11 @@
     OP_REQUIRES_OK(ctx, ctx->input("t", &then));
     OP_REQUIRES_OK(ctx, ctx->input("e", &else_));
 
+    if (TensorShapeUtils::IsScalar(cond->shape())) {
+      ComputeScalar(ctx, cond, then, else_);
+      return;
+    }
+
     bool broadcasting = (TensorShapeUtils::IsVector(cond->shape()) &&
                          !TensorShapeUtils::IsVector(then->shape()));
 
@@ -108,6 +113,25 @@
     }
   }
 
+  void ComputeScalar(OpKernelContext* ctx, const Tensor* cond,
+                     const Tensor* then, const Tensor* else_) {
+    OP_REQUIRES(
+        ctx, then->shape().IsSameSize(else_->shape()),
+        errors::InvalidArgument(
+            "'then' and 'else' must have the same size, but received: ",
+            then->shape().DebugString(), " vs. ",
+            else_->shape().DebugString()));
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, then->shape(), &output));
+
+    if (output->NumElements() > 0) {
+      functor::SelectScalarFunctor<Device, T> func;
+      TTypes<bool>::ConstScalar cond_scalar = cond->scalar<bool>();
+      func(ctx->eigen_device<Device>(), output->flat<T>(), cond_scalar,
+           then->flat<T>(), else_->flat<T>());
+    }
+  }
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(SelectOp);
 };
@@ -152,6 +176,17 @@
   }
 };
 
+// CPU Specializations of Select functors with scalar
+template <typename T>
+struct SelectScalarFunctor<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+                  TTypes<bool>::ConstScalar cond,
+                  typename TTypes<T>::ConstFlat then_flat,
+                  typename TTypes<T>::ConstFlat else_flat) {
+    out.device(d) = cond() ? then_flat : else_flat;
+  }
+};
+
 template <typename T>
 struct BatchSelectFunctor<CPUDevice, T> {
   void operator()(const CPUDevice& d,
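The CPU specialization makes the semantics plain: a scalar `cond` picks one of the two flattened inputs wholesale. From Python, assuming the `tf.select` wrapper passes a scalar condition through to this kernel (the op-level shape checks are not shown in this diff), usage would look roughly like:

```python
import tensorflow as tf

cond = tf.placeholder(tf.bool, shape=[])  # scalar condition
x = tf.constant([1.0, 2.0, 3.0])
y = tf.constant([4.0, 5.0, 6.0])
out = tf.select(cond, x, y)

with tf.Session() as sess:
    print(sess.run(out, feed_dict={cond: True}))   # [1. 2. 3.]
    print(sess.run(out, feed_dict={cond: False}))  # [4. 5. 6.]
```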
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index 2a77376..572a729 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -720,6 +720,14 @@
 };
 
 template <typename Device, typename T>
+struct SelectScalarFunctor {
+  void operator()(const Device& d, typename TTypes<T>::Flat out,
+                  typename TTypes<bool>::ConstScalar cond,
+                  typename TTypes<T>::ConstFlat then_flat,
+                  typename TTypes<T>::ConstFlat else_flat);
+};
+
+template <typename Device, typename T>
 struct BatchSelectFunctor {
   void operator()(const Device& d,
                   typename TTypes<T>::Matrix output_flat_outer_dims,
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 0acf82c..b256d24 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -21,7 +21,11 @@
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
+#if !defined(_MSC_VER)
 #define UNROLL _Pragma("unroll")
+#else
+#define UNROLL 
+#endif
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/matrix_triangular_solve_op.cc b/tensorflow/core/kernels/matrix_triangular_solve_op.cc
index 09f75f2..5f30a95 100644
--- a/tensorflow/core/kernels/matrix_triangular_solve_op.cc
+++ b/tensorflow/core/kernels/matrix_triangular_solve_op.cc
@@ -25,8 +25,25 @@
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
+#if GOOGLE_CUDA
+#include "tensorflow/core/platform/stream_executor.h"
+#endif  // GOOGLE_CUDA
+
 namespace tensorflow {
 
+#if GOOGLE_CUDA
+namespace {
+template <typename Scalar>
+perftools::gputools::DeviceMemory<Scalar> AsDeviceMemory(
+    const Scalar* cuda_memory) {
+  perftools::gputools::DeviceMemoryBase wrapped(
+      const_cast<Scalar*>(cuda_memory));
+  perftools::gputools::DeviceMemory<Scalar> typed(wrapped);
+  return typed;
+}
+}  // namespace
+#endif  // GOOGLE_CUDA
+
 template <class Scalar>
 class MatrixTriangularSolveOp : public LinearAlgebraOp<Scalar> {
  public:
@@ -60,7 +77,9 @@
   int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final {
     double rows = static_cast<double>(input_matrix_shapes[0].dim_size(0));
     double num_rhss = static_cast<double>(input_matrix_shapes[1].dim_size(1));
-    double cost = rows * rows * num_rhss;
+    double cost = rows * rows * num_rhss * 
+          (Eigen::TensorOpCost::AddCost<Scalar>() + 
+           Eigen::TensorOpCost::MulCost<Scalar>());
     return cost >= static_cast<double>(kint64max) ? kint64max
                                                   : static_cast<int64>(cost);
   }
@@ -103,6 +122,121 @@
   TF_DISALLOW_COPY_AND_ASSIGN(MatrixTriangularSolveOp);
 };
 
+
+#if GOOGLE_CUDA
+template <class Scalar>
+class MatrixTriangularSolveOpGPU : public LinearAlgebraOp<Scalar> {
+ public:
+  typedef LinearAlgebraOp<Scalar> Base;
+
+  explicit MatrixTriangularSolveOpGPU(OpKernelConstruction* context)
+      : Base(context), lower_(true), adjoint_(false) {
+    OP_REQUIRES_OK(context, context->GetAttr("lower", &lower_));
+    OP_REQUIRES_OK(context, context->GetAttr("adjoint", &adjoint_));
+  }
+
+  using TensorShapes = typename Base::TensorShapes;
+  using Matrix = typename Base::Matrix;
+  using MatrixMap = typename Base::MatrixMap;
+  using MatrixMaps = typename Base::MatrixMaps;
+  using ConstMatrixMap = typename Base::ConstMatrixMap;
+  using ConstMatrixMaps = typename Base::ConstMatrixMaps;
+
+  virtual void ValidateInputMatrixShapes(
+      OpKernelContext* context,
+      const TensorShapes& input_matrix_shapes) const final {
+    Base::ValidateSquareSolver(context, input_matrix_shapes);
+  }
+
+  TensorShapes GetOutputMatrixShapes(
+      const TensorShapes& input_matrix_shapes) const final {
+    return TensorShapes({TensorShape({input_matrix_shapes[0].dim_size(1),
+                                      input_matrix_shapes[1].dim_size(1)})});
+  }
+
+  int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final {
+    double rows = static_cast<double>(input_matrix_shapes[0].dim_size(0));
+    double num_rhss = static_cast<double>(input_matrix_shapes[1].dim_size(1));
+    double cost = rows * rows * num_rhss *
+          (Eigen::TensorOpCost::AddCost<Scalar>() +
+           Eigen::TensorOpCost::MulCost<Scalar>());
+    return cost >= static_cast<double>(kint64max) ? kint64max
+                                                  : static_cast<int64>(cost);
+  }
+
+  void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
+                     MatrixMaps* outputs) final {
+    const ConstMatrixMap& matrix = inputs[0];
+    const ConstMatrixMap& rhs = inputs[1];
+    MatrixMap& output = outputs->at(0);
+
+    if (matrix.rows() == 0 || rhs.cols() == 0) {
+      // To be consistent with the MatrixInverse op, we define the solution
+      // for an empty set of equations as the empty matrix.
+      return;
+    }
+
+    auto matrix_ptr = AsDeviceMemory(matrix.data());
+    auto rhs_ptr = AsDeviceMemory(rhs.data());
+    auto out_ptr = AsDeviceMemory(output.data());
+
+    auto* stream = context->op_device_context()->stream();
+    uint64 rhs_elems = rhs.rows() * rhs.cols();
+    bool copy_status =
+        stream->ThenMemcpyD2D(&out_ptr, rhs_ptr, sizeof(Scalar) * rhs_elems)
+        .ok();
+    if (!copy_status) {
+      context->SetStatus(
+          errors::Internal("Failed to copy rhs into output before solve"));
+      return;
+    }
+
+    // Cublas does
+    // output = matrix \ rhs
+    // where matrix, rhs and output are assumed to be column-major.
+    // We want the output to be row-major, so we can compute
+    // output' = rhs' / matrix' (' stands for transpose)
+    // Upper/lower needs to be swapped for this.
+
+    perftools::gputools::blas::UpperLower upper_lower_matrix;
+    perftools::gputools::blas::Transpose transpose_matrix;
+    if (lower_) {
+      upper_lower_matrix = perftools::gputools::blas::UpperLower::kUpper;
+    } else {
+      upper_lower_matrix = perftools::gputools::blas::UpperLower::kLower;
+    }
+    if (adjoint_) {
+      transpose_matrix = perftools::gputools::blas::Transpose::kTranspose;
+    } else {
+      transpose_matrix = perftools::gputools::blas::Transpose::kNoTranspose;
+    }
+    uint64 leading_dim_matrix = matrix.cols();
+    uint64 leading_dim_output = output.cols();
+    uint64 colmajor_rows = output.cols();
+    uint64 colmajor_cols = output.rows();
+    bool blas_launch_status =
+      stream
+        ->ThenBlasTrsm(perftools::gputools::blas::Side::kRight /*side*/,
+                       upper_lower_matrix /*uplo*/,
+                       transpose_matrix /*trans*/,
+                       perftools::gputools::blas::Diagonal::kNonUnit /*diag*/,
+                       colmajor_rows /*m*/, colmajor_cols /*n*/,
+                       Scalar(1.0) /*alpha*/,
+                       matrix_ptr, leading_dim_matrix /*lda*/,
+                       &out_ptr, leading_dim_output /*ldb*/)
+        .ok();
+    if (!blas_launch_status) {
+      context->SetStatus(errors::Internal("Blas TRSM launch failed"));
+    }
+  }
+
+ private:
+  bool lower_;
+  bool adjoint_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(MatrixTriangularSolveOpGPU);
+};
+#endif  // GOOGLE_CUDA
+
 REGISTER_LINALG_OP("MatrixTriangularSolve", (MatrixTriangularSolveOp<float>),
                    float);
 REGISTER_LINALG_OP("MatrixTriangularSolve", (MatrixTriangularSolveOp<double>),
@@ -112,4 +246,30 @@
 REGISTER_LINALG_OP("BatchMatrixTriangularSolve",
                    (MatrixTriangularSolveOp<double>), double);
 
+#ifdef GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(
+    Name("MatrixTriangularSolve")
+        .Device(DEVICE_GPU)
+        .TypeConstraint<float>("T"),
+    MatrixTriangularSolveOpGPU<float>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("MatrixTriangularSolve")
+        .Device(DEVICE_GPU)
+        .TypeConstraint<double>("T"),
+    MatrixTriangularSolveOpGPU<double>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("BatchMatrixTriangularSolve")
+        .Device(DEVICE_GPU)
+        .TypeConstraint<float>("T"),
+    MatrixTriangularSolveOpGPU<float>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("BatchMatrixTriangularSolve")
+        .Device(DEVICE_GPU)
+        .TypeConstraint<double>("T"),
+    MatrixTriangularSolveOpGPU<double>);
+#endif  // GOOGLE_CUDA
+
 }  // namespace tensorflow
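The correctness of the side/uplo swap in ComputeMatrix above rests on one identity: a row-major buffer reinterpreted as column-major is the transpose of the original matrix. A small Eigen sketch (illustrative only, not part of the patch) that checks it:

    #include <iostream>
    #include <Eigen/Dense>

    int main() {
      Eigen::Matrix<double, 2, 2, Eigen::RowMajor> a;
      a << 1, 2,
           3, 4;
      // The same buffer viewed column-major is a's transpose, which is why
      // solving A X = B on row-major data becomes a right-sided TRSM with
      // upper/lower swapped.
      Eigen::Map<Eigen::Matrix<double, 2, 2, Eigen::ColMajor>> view(a.data());
      std::cout << view.isApprox(a.transpose()) << "\n";  // prints 1
      return 0;
    }
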
diff --git a/tensorflow/core/kernels/range_sampler.h b/tensorflow/core/kernels/range_sampler.h
index 1372975..3010666 100644
--- a/tensorflow/core/kernels/range_sampler.h
+++ b/tensorflow/core/kernels/range_sampler.h
@@ -115,10 +115,12 @@
 
   int64 Sample(random::SimplePhilox* rnd) const override {
     LOG(FATAL) << "Should not be called";
+    return 0;
   }
 
   float Probability(int64 value) const override {
     LOG(FATAL) << "Should not be called";
+    return 0;
   }
 
   void SampleBatchGetExpectedCountAvoid(
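These return 0; additions are for MSVC, which does not treat LOG(FATAL) as noreturn and therefore rejects a value-returning function whose body ends without a return statement (error C4716). A minimal sketch of the pattern, assuming tensorflow's logging header:

    #include "tensorflow/core/platform/logging.h"

    int64 NeverCalled() {
      LOG(FATAL) << "Should not be called";
      return 0;  // Unreachable, but satisfies MSVC's return-path analysis.
    }
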
diff --git a/tensorflow/core/lib/io/path.cc b/tensorflow/core/lib/io/path.cc
index c59d6d1..de49d07 100644
--- a/tensorflow/core/lib/io/path.cc
+++ b/tensorflow/core/lib/io/path.cc
@@ -55,7 +55,10 @@
 // the first part of the output.
 std::pair<StringPiece, StringPiece> SplitPath(StringPiece path) {
   auto pos = path.rfind('/');
-
+#ifdef PLATFORM_WINDOWS
+  if (pos == StringPiece::npos) {
+    pos = path.rfind('\\');
+  }
+#endif
   // Handle the case with no '/' in 'path'.
   if (pos == StringPiece::npos)
     return std::make_pair(StringPiece(path.data(), 0), path);
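For illustration, the effect of the Windows fallback on SplitPath (a hedged sketch of expected behavior; the exact signature lives in tensorflow/core/lib/io/path.h):

    // Both separators now split into (directory, basename):
    //   SplitPath("C:\\logs\\run1")   -> { "C:\\logs", "run1" }
    //   SplitPath("/tmp/model.ckpt")  -> { "/tmp", "model.ckpt" }
    // Without the rfind('\\') fallback, a backslash-only path would come
    // back with an empty directory part and the whole string as basename.
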
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index cf39d2f..8d3d931 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -913,7 +913,8 @@
     .SetShapeFn([](InferenceContext* c) {
       // The inputs 'then' and 'else' must have the same shape.
       ShapeHandle data = c->input(1);
-      TF_RETURN_IF_ERROR(c->Merge(data, c->input(2), &data));
+      ShapeHandle other = c->input(2);
+      TF_RETURN_IF_ERROR(c->Merge(data, other, &data));
 
       // The input 'cond' must either have the same shape as 'then' and
       // 'else', or be a vector if 'then' and 'else' are at least vectors.
@@ -929,30 +930,49 @@
       const int32 cond_rank = c->Rank(cond);
       const int32 data_rank = c->Rank(data);
 
-      if (cond_rank != 1) {
-        // If the rank of 'cond' is != 1, the shape must match 'then' and 'else'
-        TF_RETURN_IF_ERROR(c->Merge(data, cond, &data));
+      if (cond_rank == 0) {
+        // 'cond' is a scalar, so 'then' and 'else' can have any shape.
+        c->set_output(0, data);
+        return Status::OK();
       }
-      if (data_rank != 0) {
-        // If then and else are not scalars, then cond must be at least
-        // a vector, and its first value must match that of 'else'
-        TF_RETURN_IF_ERROR(c->WithRankAtLeast(cond, 1, &cond));
-        if (cond_rank == 1) {
-          TF_RETURN_IF_ERROR(c->Merge(cond, c->Vector(c->Dim(data, 0)), &cond));
-        }
+
+      if (cond_rank != 1) {
+        // If 'cond' is neither a scalar nor a vector, its shape must
+        // match that of 'then' and 'else'.
+        TF_RETURN_IF_ERROR(c->Merge(data, cond, &data));
+        c->set_output(0, data);
+        return Status::OK();
+      }
+
+      if (data_rank == 0) {
+        // If 'then' and 'else' are scalars, 'cond' must be a scalar too.
+        TF_RETURN_IF_ERROR(c->Merge(data, cond, &data));
+        c->set_output(0, data);
+        return Status::OK();
+      }
+
+      if (cond_rank == 1) {
+        // If 'cond' is a vector and 'then'/'else' are not scalars, 'cond'
+        // must match the first dimension of 'then' and 'else'.
+        TF_RETURN_IF_ERROR(c->Merge(cond, c->Vector(c->Dim(data, 0)), &cond));
+        c->set_output(0, data);
+        return Status::OK();
       }
 
       c->set_output(0, data);
       return Status::OK();
-    })
+    })
     .Doc(R"doc(
 Selects elements from `t` or `e`, depending on `condition`.
 
-The `t`, and `e` tensors must all have the same shape,
-and the output will also have that shape.  The `condition` tensor
-must be a scalar if `t` and `e` are scalars.  If `t` and `e` are vectors
-or higher rank, then `condition` must be either a vector with size
-matching the first dimension of `t`, or must have the same shape as `t`.
+The `t` and `e` tensors must have the same shape, and the
+output will also have that shape.
+
+The `condition` tensor must be a scalar if `t` and `e` are scalars.
+If `t` and `e` are vectors or higher rank, then `condition` must be either a
+scalar, a vector with size matching the first dimension of `t`, or a tensor
+with the same shape as `t`.
 
 The `condition` tensor acts as a mask that chooses, based on the value at each
 element, whether the corresponding element / row in the output should be
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index edcd09a..79ae187 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -188,7 +188,10 @@
   ShapeInferenceTestOp op("Select");
   INFER_OK(op, "?;?;?", "in1|in2");
 
+  // Scalar 'cond' case: 'then'/'else' may have any shape.
+  INFER_OK(op, "[];[1];?", "in1");
   INFER_OK(op, "[];?;?", "in1|in2");
+
   INFER_OK(op, "[1];?;?",
            "in1|in2");  // When cond is vector, t/e may not match it.
   INFER_OK(op, "[1,2];?;?", "in1|in2?");
@@ -200,8 +203,8 @@
   INFER_OK(op, "?;[1,2];?", "in1");
   INFER_OK(op, "?;?;[1,2]", "in2");
 
-  INFER_OK(op, "[1];[];?", "in1");
-  INFER_ERROR("Shapes must be equal rank, but are 1 and 0", op, "[];[1];?");
+  INFER_ERROR("Shapes must be equal rank, but are 0 and 1", op, "[1];[];?");
+  INFER_ERROR("Shapes must be equal rank, but are 1 and 2", op, "[];[1];[1,2]");
   INFER_ERROR("Shapes must be equal rank, but are 1 and 2", op, "[1,2];[1];?");
   INFER_OK(op, "[2];[?];[?]", "in1|in2");
 
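A short trace of why the two rewritten error expectations hold under the new inference logic (a hedged reading of the branches added above):

    // "[1];[];?"    : 'then' has rank 0, so the data_rank == 0 branch merges
    //                 data ([]) with cond ([1]) and fails: ranks 0 vs 1.
    // "[];[1];[1,2]": the initial Merge of 'then' ([1]) with 'else' ([1,2])
    //                 fails first: ranks 1 vs 2; 'cond' is never consulted.
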
diff --git a/tensorflow/core/platform/default/gpu/cupti_wrapper.h b/tensorflow/core/platform/default/gpu/cupti_wrapper.h
index e482f86..38e01ce 100644
--- a/tensorflow/core/platform/default/gpu/cupti_wrapper.h
+++ b/tensorflow/core/platform/default/gpu/cupti_wrapper.h
@@ -20,9 +20,11 @@
 
 #include <stddef.h>
 #include <stdint.h>
-
+#if defined(WIN32)
+#include "extras/CUPTI/include/cupti.h"
+#else
 #include "cuda/extras/CUPTI/include/cupti.h"
-
+#endif
 namespace perftools {
 namespace gputools {
 namespace profiler {
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 1d6928b..3aaf3a5 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -261,6 +261,14 @@
   virtual Status GetSymbolFromLibrary(void* handle, const char* symbol_name,
                                       void** symbol) = 0;
 
+  // \brief Builds the name of a dynamic library.
+  //
+  // "name" should be the name of the library.
+  // "version" should be the version of the library, or empty.
+  // Returns a name that LoadLibrary() can use.
+  virtual string FormatLibraryFileName(const string& name,
+      const string& version) = 0;
+
  private:
   std::unique_ptr<FileSystemRegistry> file_system_registry_;
   TF_DISALLOW_COPY_AND_ASSIGN(Env);
@@ -318,7 +326,10 @@
                               void** symbol) override {
     return target_->GetSymbolFromLibrary(handle, symbol_name, symbol);
   }
-
+  string FormatLibraryFileName(const string& name,
+                               const string& version) override {
+    return target_->FormatLibraryFileName(name, version);
+  }
+
  private:
   Env* target_;
 };
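A hedged usage sketch of the new virtual, combining it with Env::LoadLibrary; the POSIX output shown is an assumption, while the Windows form follows from the implementation later in this change:

    Env* env = Env::Default();
    string lib = env->FormatLibraryFileName("foo", "1");
    // e.g. "libfoo.so.1" on Linux (assumed), "foo1.dll" on Windows.
    void* handle = nullptr;
    Status s = env->LoadLibrary(lib.c_str(), &handle);
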
diff --git a/tensorflow/core/platform/load_library.h b/tensorflow/core/platform/load_library.h
index 850ca9f..9038de2 100644
--- a/tensorflow/core/platform/load_library.h
+++ b/tensorflow/core/platform/load_library.h
@@ -25,8 +25,6 @@
 Status LoadLibrary(const char* library_filename, void** handle);
 Status GetSymbolFromLibrary(void* handle, const char* symbol_name,
                             void** symbol);
-// Return the filename of a dynamically linked library formatted according to
-// platform naming conventions
 string FormatLibraryFileName(const string& name, const string& version);
 
 }  // namespace internal
diff --git a/tensorflow/core/platform/platform.h b/tensorflow/core/platform/platform.h
index 982a7b8..55d7954 100644
--- a/tensorflow/core/platform/platform.h
+++ b/tensorflow/core/platform/platform.h
@@ -20,7 +20,8 @@
 // mobile.
 
 #if !defined(PLATFORM_POSIX) && !defined(PLATFORM_GOOGLE) && \
-    !defined(PLATFORM_POSIX_ANDROID) && !defined(PLATFORM_GOOGLE_ANDROID)
+    !defined(PLATFORM_POSIX_ANDROID) && !defined(PLATFORM_GOOGLE_ANDROID) && \
+    !defined(PLATFORM_WINDOWS)
 
 // Choose which platform we are on.
 #if defined(ANDROID) || defined(__ANDROID__)
diff --git a/tensorflow/core/platform/posix/env.cc b/tensorflow/core/platform/posix/env.cc
index 75e300a..2f9c8e4 100644
--- a/tensorflow/core/platform/posix/env.cc
+++ b/tensorflow/core/platform/posix/env.cc
@@ -119,6 +119,10 @@
     return tensorflow::internal::GetSymbolFromLibrary(handle, symbol_name,
                                                       symbol);
   }
+
+  string FormatLibraryFileName(const string& name,
+                               const string& version) override {
+    return tensorflow::internal::FormatLibraryFileName(name, version);
+  }
 };
 
 }  // namespace
diff --git a/tensorflow/core/platform/stacktrace.h b/tensorflow/core/platform/stacktrace.h
index beb97b6..3c953c9 100644
--- a/tensorflow/core/platform/stacktrace.h
+++ b/tensorflow/core/platform/stacktrace.h
@@ -22,7 +22,7 @@
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/core/platform/google/stacktrace.h"
 #elif defined(PLATFORM_POSIX) || defined(PLATFORM_POSIX_ANDROID) || \
-    defined(PLATFORM_GOOGLE_ANDROID)
+    defined(PLATFORM_GOOGLE_ANDROID) || defined(PLATFORM_WINDOWS)
 #include "tensorflow/core/platform/default/stacktrace.h"
 #else
 #error Define the appropriate PLATFORM_<foo> macro for this platform
diff --git a/tensorflow/core/platform/windows/env.cc b/tensorflow/core/platform/windows/env.cc
index 09edc10..41ce5d9 100644
--- a/tensorflow/core/platform/windows/env.cc
+++ b/tensorflow/core/platform/windows/env.cc
@@ -26,6 +26,7 @@
 
 #include <thread>
 #include <vector>
+#include <string>
 
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/platform/load_library.h"
@@ -52,7 +53,20 @@
 
 class WindowsEnv : public Env {
  public:
-  WindowsEnv() {}
+  WindowsEnv()
+      : GetSystemTimePreciseAsFileTime_(NULL) {
+    // GetSystemTimePreciseAsFileTime function is only available in the latest
+    // versions of Windows. For that reason, we try to look it up in
+    // kernel32.dll at runtime and use an alternative option if the function
+    // is not available.
+    HMODULE module = GetModuleHandle("kernel32.dll");
+    if (module != NULL) {
+      auto func = (FnGetSystemTimePreciseAsFileTime)GetProcAddress(
+          module, "GetSystemTimePreciseAsFileTime");
+      GetSystemTimePreciseAsFileTime_ = func;
+    }
+  }
+
   ~WindowsEnv() override {
     LOG(FATAL) << "Env::Default() must not be destroyed";
   }
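The constructor uses a common pattern for calling newer WinAPI functions while remaining loadable on older Windows versions: resolve the symbol at runtime and fall back if it is absent. A condensed sketch (the free function is illustrative):

    typedef VOID(WINAPI* FnGetSystemTimePreciseAsFileTime)(LPFILETIME);

    FnGetSystemTimePreciseAsFileTime ResolvePreciseTimeFn() {
      // GetModuleHandle, not LoadLibrary: kernel32.dll is always loaded.
      HMODULE module = GetModuleHandle("kernel32.dll");
      if (module == NULL) return NULL;
      return reinterpret_cast<FnGetSystemTimePreciseAsFileTime>(
          GetProcAddress(module, "GetSystemTimePreciseAsFileTime"));
    }
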
@@ -62,11 +76,32 @@
   }
 
   uint64 NowMicros() override {
-    FILETIME temp;
-    GetSystemTimeAsFileTime(&temp);
-    uint64 now_ticks =
-        (uint64)temp.dwLowDateTime + ((uint64)(temp.dwHighDateTime) << 32LL);
-    return now_ticks / 10LL;
+    if (GetSystemTimePreciseAsFileTime_ != NULL) {
+      // GetSystemTimePreciseAsFileTime function is only available in latest
+      // versions of Windows, so we need to check for its existence here.
+      // All std::chrono clocks on Windows proved to return
+      // values that may repeat, which is not good enough for some uses.
+      constexpr int64_t kUnixEpochStartTicks = 116444736000000000i64;
+      constexpr int64_t kFtToMicroSec = 10;
+
+      // This interface needs to return system time and not
+      // just any microseconds because it is often used as an argument
+      // to TimedWait() on condition variable
+      FILETIME system_time;
+      GetSystemTimePreciseAsFileTime_(&system_time);
+
+      LARGE_INTEGER li;
+      li.LowPart = system_time.dwLowDateTime;
+      li.HighPart = system_time.dwHighDateTime;
+      // Subtract unix epoch start
+      li.QuadPart -= kUnixEpochStartTicks;
+      // Convert to microsecs
+      li.QuadPart /= kFtToMicroSec;
+      return li.QuadPart;
+    }
+    using namespace std::chrono;
+    return duration_cast<microseconds>(
+        system_clock::now().time_since_epoch()).count();
   }
 
   void SleepForMicroseconds(int64 micros) override { Sleep(micros / 1000); }
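The FILETIME arithmetic in NowMicros is easy to verify in isolation: FILETIME counts 100 ns ticks since 1601-01-01, so subtracting the tick count of the Unix epoch and dividing by 10 yields microseconds since 1970. A portable, standalone check (no WinAPI required):

    #include <cassert>
    #include <cstdint>

    // 100-ns ticks between 1601-01-01 and 1970-01-01 (the constant above).
    constexpr int64_t kUnixEpochStartTicks = 116444736000000000LL;
    constexpr int64_t kFtToMicroSec = 10;  // 10 ticks of 100 ns per microsecond

    int64_t FileTimeTicksToUnixMicros(int64_t ticks) {
      return (ticks - kUnixEpochStartTicks) / kFtToMicroSec;
    }

    int main() {
      // The Unix epoch itself maps to zero microseconds.
      assert(FileTimeTicksToUnixMicros(kUnixEpochStartTicks) == 0);
      // One second past the epoch (10^7 ticks) is 10^6 microseconds.
      assert(FileTimeTicksToUnixMicros(kUnixEpochStartTicks + 10000000LL) ==
             1000000);
      return 0;
    }
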
@@ -94,19 +129,53 @@
     });
   }
 
-  Status LoadLibrary(const char* library_filename, void** handle) override {
-    return errors::Unimplemented("WindowsEnv::LoadLibrary");
+  Status LoadLibrary(const char* library_filename, void** handle) override {
+    std::string file_name = library_filename;
+    std::replace(file_name.begin(), file_name.end(), '/', '\\');
+
+    HMODULE hModule = LoadLibraryEx(file_name.c_str(), NULL,
+      LOAD_WITH_ALTERED_SEARCH_PATH);
+    if (!hModule) {
+      return errors::NotFound(file_name + " not found");
+    }
+    *handle = hModule;
+    return Status::OK();
   }
 
   Status GetSymbolFromLibrary(void* handle, const char* symbol_name,
-                              void** symbol) override {
-    return errors::Unimplemented("WindowsEnv::GetSymbolFromLibrary");
+                              void** symbol) override {
+    FARPROC found_symbol;
+
+    found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
+    if (found_symbol == NULL) {
+      return errors::NotFound(std::string(symbol_name) + " not found");
+    }
+    *symbol = reinterpret_cast<void*>(found_symbol);
+    return Status::OK();
   }
+
+  string FormatLibraryFileName(const string& name,
+                               const string& version) override {
+    string filename;
+    if (version.empty()) {
+      filename = name + ".dll";
+    } else {
+      filename = name + version + ".dll";
+    }
+    return filename;
+  }
+
+ private:
+  typedef VOID(WINAPI * FnGetSystemTimePreciseAsFileTime)(LPFILETIME);
+  FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_;
 };
 
 }  // namespace
 
 REGISTER_FILE_SYSTEM("", WindowsFileSystem);
+REGISTER_FILE_SYSTEM("file", LocalWinFileSystem);
+
 Env* Env::Default() {
   static Env* default_env = new WindowsEnv;
   return default_env;
diff --git a/tensorflow/core/platform/windows/error.cc b/tensorflow/core/platform/windows/error.cc
new file mode 100644
index 0000000..39e941a
--- /dev/null
+++ b/tensorflow/core/platform/windows/error.cc
@@ -0,0 +1,33 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/windows/error.h"
+
+namespace tensorflow {
+namespace internal {
+
+std::string GetWindowsErrorMessage(DWORD err) {
+  LPSTR buffer = NULL;
+  DWORD flags = FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
+      FORMAT_MESSAGE_IGNORE_INSERTS;
+  FormatMessageA(flags, NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+                 reinterpret_cast<LPSTR>(&buffer), 0, NULL);
+  // FormatMessageA may fail and leave 'buffer' untouched; guard against
+  // dereferencing NULL. LocalFree(NULL) is a no-op.
+  std::string message = (buffer != NULL) ? buffer : "unknown error";
+  LocalFree(buffer);
+  return message;
+}
+
+}  // namespace internal
+}  // namespace tensorflow
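Typical call site for the new helper (a sketch; the networking code later in this change uses the same shape):

    DWORD err = ::GetLastError();
    LOG(ERROR) << "CreateFile failed: "
               << tensorflow::internal::GetWindowsErrorMessage(err);
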
diff --git a/tensorflow/core/platform/windows/error.h b/tensorflow/core/platform/windows/error.h
new file mode 100644
index 0000000..026e0d5
--- /dev/null
+++ b/tensorflow/core/platform/windows/error.h
@@ -0,0 +1,32 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_WINDOWS_ERROR_H_
+#define TENSORFLOW_CORE_PLATFORM_WINDOWS_ERROR_H_
+
+#include <string>
+
+#include <Windows.h>
+
+namespace tensorflow {
+namespace internal {
+
+std::string GetWindowsErrorMessage(DWORD err);
+
+}  // namespace internal
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_WINDOWS_ERROR_H_
+
diff --git a/tensorflow/core/platform/windows/net.cc b/tensorflow/core/platform/windows/net.cc
index fbc0c39..46eb072 100644
--- a/tensorflow/core/platform/windows/net.cc
+++ b/tensorflow/core/platform/windows/net.cc
@@ -15,25 +15,27 @@
 
 #include "tensorflow/core/platform/net.h"
 
-#include <cerrno>
 #include <cstdlib>
 #include <unordered_set>
 
 #include <sys/types.h>
-#include <winsock.h>
+#include <winsock2.h>
 
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/windows/error.h"
 
 #undef ERROR
 
+#pragma comment(lib, "Ws2_32.lib")
+
 namespace tensorflow {
 namespace internal {
 
 namespace {
+
 bool IsPortAvailable(int* port, bool is_tcp) {
   const int protocol = is_tcp ? IPPROTO_TCP : 0;
-  const int fd = socket(AF_INET, is_tcp ? SOCK_STREAM : SOCK_DGRAM, protocol);
+  SOCKET sock = socket(AF_INET, is_tcp ? SOCK_STREAM : SOCK_DGRAM, protocol);
 
   struct sockaddr_in addr;
   int addr_len = static_cast<int>(sizeof(addr));
@@ -41,17 +43,20 @@
 
   CHECK_GE(*port, 0);
   CHECK_LE(*port, 65535);
-  if (fd < 0) {
-    LOG(ERROR) << "socket() failed: " << strerror(errno);
+  if (sock == INVALID_SOCKET) {
+    LOG(ERROR) << "socket() failed: " <<
+        GetWindowsErrorMessage(WSAGetLastError());
     return false;
   }
 
-  // SO_REUSEADDR lets us start up a server immediately after it exists.
-  int one = 1;
-  if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (const char*)&one, sizeof(one)) <
-      0) {
-    LOG(ERROR) << "setsockopt() failed: " << strerror(errno);
-    closesocket(fd);
+  // SO_REUSEADDR lets us start up a server immediately after it exits.
+  const int one = 1;
+  int result = setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+                          reinterpret_cast<const char*>(&one), sizeof(one));
+  if (result == SOCKET_ERROR) {
+    LOG(ERROR) << "setsockopt() failed: " <<
+        GetWindowsErrorMessage(WSAGetLastError());
+    closesocket(sock);
     return false;
   }
 
@@ -59,18 +64,23 @@
   addr.sin_family = AF_INET;
   addr.sin_addr.s_addr = INADDR_ANY;
   addr.sin_port = htons((uint16_t)*port);
-  if (bind(fd, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
-    LOG(WARNING) << "bind(port=" << *port << ") failed: " << strerror(errno);
-    closesocket(fd);
+  result = bind(sock, (struct sockaddr*)&addr, sizeof(addr));
+  if (result == SOCKET_ERROR) {
+    LOG(WARNING) << "bind(port=" << *port << ") failed: " <<
+        GetWindowsErrorMessage(WSAGetLastError());
+    closesocket(sock);
     return false;
   }
 
   // Get the bound port number.
-  if (getsockname(fd, (struct sockaddr*)&addr, &addr_len) < 0) {
-    LOG(WARNING) << "getsockname() failed: " << strerror(errno);
-    closesocket(fd);
+  result = getsockname(sock, (struct sockaddr*)&addr, &addr_len);
+  if (result == SOCKET_ERROR) {
+    LOG(WARNING) << "getsockname() failed: " <<
+        GetWindowsErrorMessage(WSAGetLastError());
+    closesocket(sock);
     return false;
   }
+
   CHECK_LE(addr_len, sizeof(addr));
   actual_port = ntohs(addr.sin_port);
   CHECK_GT(actual_port, 0);
@@ -79,7 +89,8 @@
   } else {
     CHECK_EQ(*port, actual_port);
   }
-  closesocket(fd);
+
+  closesocket(sock);
   return true;
 }
 
@@ -89,6 +100,12 @@
 }  // namespace
 
 int PickUnusedPortOrDie() {
+  WSADATA wsaData;
+  if (WSAStartup(MAKEWORD(2, 2), &wsaData) != NO_ERROR) {
+    // Returning a bool from an int-returning function was wrong; this
+    // function promises to die on failure, so do that.
+    LOG(FATAL) << "WSAStartup() failed";
+  }
+
   static std::unordered_set<int> chosen_ports;
 
   // Type of port to first pick in the next iteration.
@@ -121,6 +138,7 @@
     }
 
     chosen_ports.insert(port);
+    WSACleanup();
     return port;
   }
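Since WSAStartup/WSACleanup are reference-counted, an RAII guard would also cover the early-return paths in the loop above; a hedged sketch of that alternative:

    #include <winsock2.h>

    // Illustrative only: ties WSACleanup to scope exit.
    class WinsockScope {
     public:
      WinsockScope() {
        WSADATA data;
        ok_ = (WSAStartup(MAKEWORD(2, 2), &data) == NO_ERROR);
      }
      ~WinsockScope() {
        if (ok_) WSACleanup();
      }
      bool ok() const { return ok_; }

     private:
      bool ok_ = false;
    };
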
 
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index 0721976..ee5be22 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -19,8 +19,8 @@
 #ifdef SNAPPY
 #include <snappy.h>
 #endif
-#include <WinSock2.h>
-#pragma comment(lib, "Ws2_32.lib")
+
+#include <Windows.h>
 
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/demangle.h"
@@ -37,10 +37,13 @@
 void InitMain(const char* usage, int* argc, char*** argv) {}
 
 string Hostname() {
-  char hostname[1024];
-  gethostname(hostname, sizeof hostname);
-  hostname[sizeof hostname - 1] = 0;
-  return string(hostname);
+  char name[1024];
+  DWORD name_size = sizeof(name);
+  name[0] = 0;
+  if (::GetComputerNameA(name, &name_size)) {
+    name[name_size] = 0;
+  }
+  return name;
 }
 
 int NumSchedulableCPUs() {
diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc
index 44b26d9..714bb55 100644
--- a/tensorflow/core/platform/windows/windows_file_system.cc
+++ b/tensorflow/core/platform/windows/windows_file_system.cc
@@ -30,6 +30,7 @@
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/posix/error.h"
+#include "tensorflow/core/platform/windows/error.h"
 #include "tensorflow/core/platform/windows/windows_file_system.h"
 
 // TODO(mrry): Prevent this Windows.h #define from leaking out of our headers.
@@ -39,19 +40,71 @@
 
 namespace {
 
+// RAII helpers for HANDLEs
+const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); };
+typedef std::unique_ptr<void, decltype(CloseHandleFunc)> UniqueCloseHandlePtr;
+
+inline Status IOErrorFromWindowsError(const string& context, DWORD err) {
+  return IOError(
+      context + string(" : ") + internal::GetWindowsErrorMessage(err), err);
+}
+
+// PLEASE NOTE: hfile is expected to be an async handle
+// (i.e. opened with FILE_FLAG_OVERLAPPED)
+SSIZE_T pread(HANDLE hfile, char* dst, size_t num_bytes, uint64_t offset) {
+  assert(num_bytes <= std::numeric_limits<DWORD>::max());
+  OVERLAPPED overlapped = {0};
+  ULARGE_INTEGER offset_union;
+  offset_union.QuadPart = offset;
+
+  overlapped.Offset = offset_union.LowPart;
+  overlapped.OffsetHigh = offset_union.HighPart;
+  overlapped.hEvent = ::CreateEvent(NULL, TRUE, FALSE, NULL);
+
+  if (NULL == overlapped.hEvent) {
+    return -1;
+  }
+
+  SSIZE_T result = 0;
+
+  unsigned long bytes_read = 0;
+  DWORD last_error = ERROR_SUCCESS;
+
+  BOOL read_result = ::ReadFile(hfile, dst, static_cast<DWORD>(num_bytes),
+                                &bytes_read, &overlapped);
+  if (TRUE == read_result) {
+    // The read completed synchronously; bytes_read already has the result.
+    result = bytes_read;
+  } else if ((last_error = GetLastError()) != ERROR_IO_PENDING) {
+    // The read failed outright; EOF counts as a zero-byte read.
+    result = (last_error == ERROR_HANDLE_EOF) ? 0 : -1;
+  } else {
+    // The read is pending; block until it completes.
+    BOOL overlapped_result = ::GetOverlappedResult(hfile, &overlapped,
+                                                   &bytes_read, TRUE);
+    if (FALSE == overlapped_result) {
+      result = (::GetLastError() == ERROR_HANDLE_EOF) ? 0 : -1;
+    } else {
+      result = bytes_read;
+    }
+  }
+
+  ::CloseHandle(overlapped.hEvent);
+
+  return result;
+}
+
 // read() based random-access
 class WindowsRandomAccessFile : public RandomAccessFile {
  private:
   string filename_;
-  FILE* file_;
+  HANDLE hfile_;
 
  public:
-  WindowsRandomAccessFile(const string& fname, FILE* f)
-      : filename_(fname), file_(f) {}
+  WindowsRandomAccessFile(const string& fname, HANDLE hfile)
+      : filename_(fname), hfile_(hfile) {}
   ~WindowsRandomAccessFile() override {
-    if (file_ != NULL) {
-      // Ignoring any potential errors
-      fclose(file_);
+    if (hfile_ != NULL && hfile_ != INVALID_HANDLE_VALUE) {
+      ::CloseHandle(hfile_);
     }
   }
 
@@ -59,13 +112,10 @@
               char* scratch) const override {
     Status s;
     char* dst = scratch;
-    int seek_result = fseek(file_, offset, SEEK_SET);
-    if (seek_result) {
-      return IOError(filename_, errno);
-    }
     while (n > 0 && s.ok()) {
-      size_t r = fread(dst, 1, n, file_);
+      SSIZE_T r = pread(hfile_, dst, n, offset);
       if (r > 0) {
+        offset += r;
         dst += r;
         n -= r;
       } else if (r == 0) {
@@ -84,104 +134,246 @@
 class WindowsWritableFile : public WritableFile {
  private:
   string filename_;
-  FILE* file_;
+  HANDLE hfile_;
 
  public:
-  WindowsWritableFile(const string& fname, FILE* f)
-      : filename_(fname), file_(f) {}
+  WindowsWritableFile(const string& fname, HANDLE hFile)
+      : filename_(fname), hfile_(hFile) {}
 
   ~WindowsWritableFile() override {
-    if (file_ != NULL) {
-      // Ignoring any potential errors
-      fclose(file_);
+    if (hfile_ != NULL && hfile_ != INVALID_HANDLE_VALUE) {
+      WindowsWritableFile::Close();
     }
   }
 
   Status Append(const StringPiece& data) override {
-    size_t r = fwrite(data.data(), 1, data.size(), file_);
-    if (r != data.size()) {
-      return IOError(filename_, errno);
+    DWORD bytes_written = 0;
+    DWORD data_size = static_cast<DWORD>(data.size());
+    BOOL write_result = ::WriteFile(hfile_, data.data(), data_size,
+                                    &bytes_written, NULL);
+    if (FALSE == write_result) {
+      return IOErrorFromWindowsError(
+          "Failed to WriteFile: " + filename_, ::GetLastError());
     }
+
+    assert(static_cast<size_t>(bytes_written) == data.size());
     return Status::OK();
   }
 
   Status Close() override {
-    Status result;
-    if (fclose(file_) != 0) {
-      result = IOError(filename_, errno);
+    assert(INVALID_HANDLE_VALUE != hfile_);
+
+    Status result = Flush();
+    if (!result.ok()) {
+      return result;
     }
-    file_ = NULL;
-    return result;
+
+    if (FALSE == ::CloseHandle(hfile_)) {
+      return IOErrorFromWindowsError(
+          "CloseHandle failed for: " + filename_, ::GetLastError());
+    }
+
+    hfile_ = INVALID_HANDLE_VALUE;
+    return Status::OK();
   }
 
   Status Flush() override {
-    if (fflush(file_) != 0) {
-      return IOError(filename_, errno);
+    if (FALSE == ::FlushFileBuffers(hfile_)) {
+      return IOErrorFromWindowsError(
+          "FlushFileBuffers failed for: " + filename_, ::GetLastError());
     }
     return Status::OK();
   }
 
   Status Sync() override {
-    Status s;
-    if (fflush(file_) != 0) {
-      s = IOError(filename_, errno);
-    }
-    return s;
+    return Flush();
   }
 };
 
+class WinReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
+ private:
+  const std::string filename_;
+  HANDLE hfile_;
+  HANDLE hmap_;
+
+  const void* const address_;
+  const uint64 length_;
+
+ public:
+  WinReadOnlyMemoryRegion(const std::string& filename, HANDLE hfile,
+                          HANDLE hmap, const void* address, uint64 length)
+      : filename_(filename), hfile_(hfile), hmap_(hmap), address_(address),
+        length_(length) {}
+
+  ~WinReadOnlyMemoryRegion() {
+    BOOL ret = ::UnmapViewOfFile(address_);
+    assert(ret);
+
+    ret = ::CloseHandle(hmap_);
+    assert(ret);
+
+    ret = ::CloseHandle(hfile_);
+    assert(ret);
+  }
+
+  const void* data() override { return address_; }
+  uint64 length() override { return length_; }
+};
+
 }  // namespace
 
 Status WindowsFileSystem::NewRandomAccessFile(
     const string& fname, std::unique_ptr<RandomAccessFile>* result) {
   string translated_fname = TranslateName(fname);
   result->reset();
-  Status s;
-  FILE* f = fopen(translated_fname.c_str(), "r");
-  if (f == NULL) {
-    s = IOError(fname, errno);
-  } else {
-    result->reset(new WindowsRandomAccessFile(translated_fname, f));
+
+  // Open the file for read-only random access.
+  // The random-access hint disables read-ahead, which would otherwise make
+  // the system read more data than needed.
+  // Open in async mode, which makes Windows allow more parallelism even
+  // if we need to do sync I/O on top of it.
+  DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_RANDOM_ACCESS |
+      FILE_FLAG_OVERLAPPED;
+  // Shared access is necessary for tests to pass; almost all tests would
+  // work with the possible exception of fault_injection.
+  DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
+
+  HANDLE hfile = ::CreateFileA(translated_fname.c_str(), GENERIC_READ,
+                               share_mode, NULL, OPEN_EXISTING, file_flags,
+                               NULL);
+
+  if (INVALID_HANDLE_VALUE == hfile) {
+    string context = "NewRandomAccessFile failed to Create/Open: " + fname;
+    return IOErrorFromWindowsError(context, ::GetLastError());
   }
-  return s;
+
+  result->reset(new WindowsRandomAccessFile(translated_fname, hfile));
+  return Status::OK();
 }
 
 Status WindowsFileSystem::NewWritableFile(
     const string& fname, std::unique_ptr<WritableFile>* result) {
   string translated_fname = TranslateName(fname);
-  Status s;
-  FILE* f = fopen(translated_fname.c_str(), "w");
-  if (f == NULL) {
-    result->reset();
-    s = IOError(fname, errno);
-  } else {
-    result->reset(new WindowsWritableFile(translated_fname, f));
+  result->reset();
+
+  DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
+  HANDLE hfile = ::CreateFileA(translated_fname.c_str(), GENERIC_WRITE,
+                               share_mode, NULL, CREATE_ALWAYS,
+                               FILE_ATTRIBUTE_NORMAL, NULL);
+
+  if (INVALID_HANDLE_VALUE == hfile) {
+    string context = "Failed to create a NewWriteableFile: " + fname;
+    return IOErrorFromWindowsError(context, ::GetLastError());
   }
-  return s;
+
+  result->reset(new WindowsWritableFile(translated_fname, hfile));
+  return Status::OK();
 }
 
 Status WindowsFileSystem::NewAppendableFile(
     const string& fname, std::unique_ptr<WritableFile>* result) {
   string translated_fname = TranslateName(fname);
-  Status s;
-  FILE* f = fopen(translated_fname.c_str(), "a");
-  if (f == NULL) {
-    result->reset();
-    s = IOError(fname, errno);
-  } else {
-    result->reset(new WindowsWritableFile(translated_fname, f));
+  result->reset();
+
+  DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
+  HANDLE hfile = ::CreateFileA(translated_fname.c_str(), GENERIC_WRITE,
+                               share_mode, NULL, OPEN_ALWAYS,
+                               FILE_ATTRIBUTE_NORMAL, NULL);
+
+  if (INVALID_HANDLE_VALUE == hfile) {
+    string context = "Failed to create a NewAppendableFile: " + fname;
+    return IOErrorFromWindowsError(context, ::GetLastError());
   }
-  return s;
+
+  UniqueCloseHandlePtr file_guard(hfile, CloseHandleFunc);
+
+  DWORD file_ptr = ::SetFilePointer(hfile, 0, NULL, FILE_END);
+  if (INVALID_SET_FILE_POINTER == file_ptr) {
+    string context = "Failed to create a NewAppendableFile: " + fname;
+    return IOErrorFromWindowsError(context, ::GetLastError());
+  }
+
+  result->reset(new WindowsWritableFile(translated_fname, hfile));
+  file_guard.release();
+
+  return Status::OK();
 }
 
 Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile(
     const string& fname, std::unique_ptr<ReadOnlyMemoryRegion>* result) {
-  return errors::Unimplemented(
-      "WindowsFileSystem::NewReadOnlyMemoryRegionFromFile");
+  string translated_fname = TranslateName(fname);
+  result->reset();
+  Status s = Status::OK();
+
+  // Open the file for read-only random access
+  DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_RANDOM_ACCESS;
+
+  // Open in async mode which makes Windows allow more parallelism even
+  // if we need to do sync I/O on top of it.
+  file_flags |= FILE_FLAG_OVERLAPPED;
+
+  DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
+  HANDLE hfile = ::CreateFileA(translated_fname.c_str(), GENERIC_READ,
+                               share_mode, NULL, OPEN_EXISTING, file_flags,
+                               NULL);
+
+  if (INVALID_HANDLE_VALUE == hfile) {
+    return IOErrorFromWindowsError(
+        "NewReadOnlyMemoryRegionFromFile failed to Create/Open: " + fname,
+        ::GetLastError());
+  }
+
+  UniqueCloseHandlePtr file_guard(hfile, CloseHandleFunc);
+
+  // Use mmap when virtual address-space is plentiful.
+  uint64_t file_size;
+  s = GetFileSize(translated_fname, &file_size);
+  if (s.ok()) {
+    // Will not map empty files
+    if (file_size == 0) {
+      return IOError(
+          "NewReadOnlyMemoryRegionFromFile failed to map empty file: " + fname,
+          EINVAL);
+    }
+
+    HANDLE hmap = ::CreateFileMappingA(hfile, NULL, PAGE_READONLY,
+                                       0,  // Whole file at its present length
+                                       0,
+                                       NULL);  // Mapping name
+
+    if (!hmap) {
+      string context = "Failed to create file mapping for "
+                       "NewReadOnlyMemoryRegionFromFile: " + fname;
+      return IOErrorFromWindowsError(context, ::GetLastError());
+    }
+
+    UniqueCloseHandlePtr map_guard(hmap, CloseHandleFunc);
+
+    const void* mapped_region = ::MapViewOfFileEx(
+        hmap, FILE_MAP_READ,
+        0,  // High DWORD of access start
+        0,  // Low DWORD
+        file_size,
+        NULL);  // Let the OS choose the mapping
+
+    if (!mapped_region) {
+      string context = "Failed to MapViewOfFile for "
+                       "NewReadOnlyMemoryRegionFromFile: " + fname;
+      return IOErrorFromWindowsError(context, ::GetLastError());
+    }
+
+    result->reset(new WinReadOnlyMemoryRegion(fname, hfile, hmap,
+                                              mapped_region, file_size));
+
+    map_guard.release();
+    file_guard.release();
+  }
+
+  return s;
 }
 
 bool WindowsFileSystem::FileExists(const string& fname) {
-  return _access(TranslateName(fname).c_str(), 0) == 0;
+  constexpr int kOk = 0;
+  return _access(TranslateName(fname).c_str(), kOk) == 0;
 }
 
 Status WindowsFileSystem::GetChildren(const string& dir,
@@ -189,27 +381,39 @@
   string translated_dir = TranslateName(dir);
   result->clear();
 
+  string pattern = translated_dir;
+  if (!pattern.empty() && pattern.back() != '\\' && pattern.back() != '/') {
+    pattern += "\\*";  // Must be a string literal; '\\*' is a multi-char
+                       // literal that would append only '*'.
+  } else {
+    pattern += '*';
+  }
+
   WIN32_FIND_DATA find_data;
-  HANDLE find_handle = FindFirstFile(translated_dir.c_str(), &find_data);
+  HANDLE find_handle = ::FindFirstFileA(pattern.c_str(), &find_data);
   if (find_handle == INVALID_HANDLE_VALUE) {
-    // TODO(mrry): Convert to a more specific error.
-    return errors::Unknown("Error code: ", GetLastError());
+    string context = "FindFirstFile failed for: " + translated_dir;
+    return IOErrorFromWindowsError(context, ::GetLastError());
   }
-  result->push_back(find_data.cFileName);
-  while (FindNextFile(find_handle, &find_data)) {
-    result->push_back(find_data.cFileName);
+
+  do {
+    const StringPiece basename = find_data.cFileName;
+    if (basename != "." && basename != "..") {
+      result->push_back(find_data.cFileName);
+    }
+  } while (::FindNextFileA(find_handle, &find_data));
+
+  if (!::FindClose(find_handle)) {
+    string context = "FindClose failed for: " + translated_dir;
+    return IOErrorFromWindowsError(context, ::GetLastError());
   }
-  if (!FindClose(find_handle)) {
-    // TODO(mrry): Convert to a more specific error.
-    return errors::Unknown("Error closing find handle: ", GetLastError());
-  }
+
   return Status::OK();
 }
 
 Status WindowsFileSystem::DeleteFile(const string& fname) {
   Status result;
   if (unlink(TranslateName(fname).c_str()) != 0) {
-    result = IOError(fname, errno);
+    result = IOError("Failed to delete a file: " + fname, errno);
   }
   return result;
 }
@@ -217,7 +421,7 @@
 Status WindowsFileSystem::CreateDir(const string& name) {
   Status result;
   if (_mkdir(TranslateName(name).c_str()) != 0) {
-    result = IOError(name, errno);
+    result = IOError("Failed to create a directory: " + name, errno);
   }
   return result;
 }
@@ -225,42 +429,52 @@
 Status WindowsFileSystem::DeleteDir(const string& name) {
   Status result;
   if (_rmdir(TranslateName(name).c_str()) != 0) {
-    result = IOError(name, errno);
+    result = IOError("Failed to remove a directory: " + name, errno);
   }
   return result;
 }
 
 Status WindowsFileSystem::GetFileSize(const string& fname, uint64* size) {
-  Status s;
-  struct _stat sbuf;
-  if (_stat(TranslateName(fname).c_str(), &sbuf) != 0) {
-    *size = 0;
-    s = IOError(fname, errno);
-  } else {
-    *size = sbuf.st_size;
+  string translated_fname = TranslateName(fname);
+  Status result;
+  WIN32_FILE_ATTRIBUTE_DATA attrs;
+  if (::GetFileAttributesExA(translated_fname.c_str(),
+                             GetFileExInfoStandard, &attrs)) {
+    ULARGE_INTEGER file_size;
+    file_size.HighPart = attrs.nFileSizeHigh;
+    file_size.LowPart = attrs.nFileSizeLow;
+    *size = file_size.QuadPart;
   }
-  return s;
+  else {
+    string context = "Can not get size for: " + fname;
+    result = IOErrorFromWindowsError(context, ::GetLastError());
+  }
+  return result;
 }
 
 Status WindowsFileSystem::RenameFile(const string& src, const string& target) {
   Status result;
-  if (rename(TranslateName(src).c_str(), TranslateName(target).c_str()) != 0) {
-    result = IOError(src, errno);
+  // Unlike rename() on Linux, rename() on Windows does not replace an
+  // existing target file, so use the OS API directly.
+  if (!::MoveFileExA(TranslateName(src).c_str(), TranslateName(target).c_str(),
+      MOVEFILE_REPLACE_EXISTING)) {
+    string context(strings::StrCat("Failed to rename: ", src, " to: ", target));
+    result = IOErrorFromWindowsError(context, ::GetLastError());
   }
   return result;
 }
 
 Status WindowsFileSystem::Stat(const string& fname, FileStatistics* stat) {
-  Status s;
+  Status result;
   struct _stat sbuf;
   if (_stat(TranslateName(fname).c_str(), &sbuf) != 0) {
-    s = IOError(fname, errno);
+    result = IOError(fname, errno);
   } else {
     stat->mtime_nsec = sbuf.st_mtime * 1e9;
     stat->length = sbuf.st_size;
     stat->is_directory = PathIsDirectory(TranslateName(fname).c_str());
   }
-  return s;
+  return result;
 }
 
 }  // namespace tensorflow
\ No newline at end of file
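The UniqueCloseHandlePtr pattern introduced at the top of this file generalizes; a hedged usage sketch showing the acquire/guard/release flow the new file functions follow:

    HANDLE h = ::CreateFileA("example.txt", GENERIC_READ, 0, NULL,
                             OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (h != INVALID_HANDLE_VALUE) {
      UniqueCloseHandlePtr guard(h, CloseHandleFunc);
      // ... work with h; on every early return the handle is closed ...
      // On the success path, ownership is handed off to the wrapper:
      // result->reset(new WindowsRandomAccessFile(name, h)); guard.release();
    }
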
diff --git a/tensorflow/core/platform/windows/windows_file_system.h b/tensorflow/core/platform/windows/windows_file_system.h
index 68b391f..12b579b 100644
--- a/tensorflow/core/platform/windows/windows_file_system.h
+++ b/tensorflow/core/platform/windows/windows_file_system.h
@@ -64,7 +64,14 @@
   }
 };
 
-Status IOError(const string& context, int err_number);
+class LocalWinFileSystem : public WindowsFileSystem {
+ public:
+  string TranslateName(const string& name) const override {
+    StringPiece scheme, host, path;
+    ParseURI(name, &scheme, &host, &path);
+    return path.ToString();
+  }
+};
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index bab4572..1e8ae0b 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -20,7 +20,7 @@
 
 #define TF_MAJOR_VERSION 0
 #define TF_MINOR_VERSION 11
-#define TF_PATCH_VERSION 0rc0
+#define TF_PATCH_VERSION 0rc1
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
diff --git a/tensorflow/examples/learn/README.md b/tensorflow/examples/learn/README.md
index 0ae72ae..c1c7e9b 100644
--- a/tensorflow/examples/learn/README.md
+++ b/tensorflow/examples/learn/README.md
@@ -21,7 +21,7 @@
 * [Deep Neural Network with Customized Decay Function](iris_custom_decay_dnn.py)
 
 ## Specialized Models
-* [Building a Random Forest Model](random_forest.py)
+* [Building a Random Forest Model](random_forest_mnist.py)
 * [Building a Wide & Deep Model](wide_n_deep_tutorial.py)
 * [Building a Residual Network Model](resnet.py)
 
diff --git a/tensorflow/examples/tutorials/mnist/BUILD b/tensorflow/examples/tutorials/mnist/BUILD
index 60fd433..532c868 100644
--- a/tensorflow/examples/tutorials/mnist/BUILD
+++ b/tensorflow/examples/tutorials/mnist/BUILD
@@ -84,7 +84,6 @@
     args = [
         "--fake_data",
         "--max_steps=10",
-        "--train_dir=/tmp/mnist",
     ],
     main = "fully_connected_feed.py",
     srcs_version = "PY2AND3",
diff --git a/tensorflow/examples/tutorials/mnist/fully_connected_feed.py b/tensorflow/examples/tutorials/mnist/fully_connected_feed.py
index c8262a0..7e4d408 100644
--- a/tensorflow/examples/tutorials/mnist/fully_connected_feed.py
+++ b/tensorflow/examples/tutorials/mnist/fully_connected_feed.py
@@ -117,7 +117,7 @@
   """Train MNIST for a number of steps."""
   # Get the sets of images and labels for training, validation, and
   # test on MNIST.
-  data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)
+  data_sets = input_data.read_data_sets(FLAGS.input_data_dir, FLAGS.fake_data)
 
   # Tell TensorFlow that the model will be built into the default Graph.
   with tf.Graph().as_default():
@@ -146,13 +146,13 @@
     init = tf.initialize_all_variables()
 
     # Create a saver for writing training checkpoints.
-    saver = tf.train.Saver()
+    saver = tf.train.Saver(write_version=tf.train.SaverDef.V2)
 
     # Create a session for running Ops on the Graph.
     sess = tf.Session()
 
     # Instantiate a SummaryWriter to output summaries and the Graph.
-    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)
+    summary_writer = tf.train.SummaryWriter(FLAGS.log_dir, sess.graph)
 
     # And then after everything is built:
 
@@ -190,7 +190,7 @@
 
       # Save a checkpoint and evaluate the model periodically.
       if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
-        checkpoint_file = os.path.join(FLAGS.train_dir, 'checkpoint')
+        checkpoint_file = os.path.join(FLAGS.log_dir, 'model.ckpt')
         saver.save(sess, checkpoint_file, global_step=step)
         # Evaluate against the training set.
         print('Training Data Eval:')
@@ -216,6 +216,9 @@
 
 
 def main(_):
+  if tf.gfile.Exists(FLAGS.log_dir):
+    tf.gfile.DeleteRecursively(FLAGS.log_dir)
+  tf.gfile.MakeDirs(FLAGS.log_dir)
   run_training()
 
 
@@ -252,10 +255,16 @@
       help='Batch size.  Must divide evenly into the dataset sizes.'
   )
   parser.add_argument(
-      '--train_dir',
+      '--input_data_dir',
       type=str,
-      default='data',
-      help='Directory to put the training data.'
+      default='/tmp/tensorflow/mnist/input_data',
+      help='Directory to put the input data.'
+  )
+  parser.add_argument(
+      '--log_dir',
+      type=str,
+      default='/tmp/tensorflow/mnist/logs/fully_connected_feed',
+      help='Directory to put the log data.'
   )
   parser.add_argument(
       '--fake_data',
diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax.py b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
index 4c6f59e..beb184f 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_softmax.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
@@ -72,7 +72,7 @@
 
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()
-  parser.add_argument('--data_dir', type=str, default='/tmp/data',
-                      help='Directory for storing data')
+  parser.add_argument('--data_dir', type=str,
+                      default='/tmp/tensorflow/mnist/input_data',
+                      help='Directory for storing input data')
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
index 9fda00a..fc91ac4 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
@@ -137,9 +137,9 @@
 
   # Merge all the summaries and write them out to /tmp/mnist_logs (by default)
   merged = tf.summary.merge_all()
-  train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train',
+  train_writer = tf.train.SummaryWriter(FLAGS.log_dir + '/train',
                                         sess.graph)
-  test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
+  test_writer = tf.train.SummaryWriter(FLAGS.log_dir + '/test')
   tf.initialize_all_variables().run()
 
   # Train the model, and also write summaries.
@@ -180,9 +180,9 @@
 
 
 def main(_):
-  if tf.gfile.Exists(FLAGS.summaries_dir):
-    tf.gfile.DeleteRecursively(FLAGS.summaries_dir)
-  tf.gfile.MakeDirs(FLAGS.summaries_dir)
+  if tf.gfile.Exists(FLAGS.log_dir):
+    tf.gfile.DeleteRecursively(FLAGS.log_dir)
+  tf.gfile.MakeDirs(FLAGS.log_dir)
   train()
 
 
@@ -197,10 +197,9 @@
                       help='Initial learning rate')
   parser.add_argument('--dropout', type=float, default=0.9,
                       help='Keep probability for training dropout.')
-  parser.add_argument('--data_dir', type=str, default='/tmp/data',
-                      help='Directory for storing data')
-  parser.add_argument('--summaries_dir', type=str, default='/tmp/mnist_logs',
-                      help='Summaries directory')
-
+  parser.add_argument('--data_dir', type=str,
+                      default='/tmp/tensorflow/mnist/input_data',
+                      help='Directory for storing input data')
+  parser.add_argument('--log_dir', type=str,
+                      default='/tmp/tensorflow/mnist/logs/mnist_with_summaries',
+                      help='Summaries log directory')
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.nn.sampled_softmax_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.nn.sampled_softmax_loss.md
index 6d22f67..44388cc 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.nn.sampled_softmax_loss.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.nn.sampled_softmax_loss.md
@@ -11,8 +11,8 @@
 At inference time, you can compute full softmax probabilities with the
 expression `tf.nn.softmax(tf.matmul(inputs, tf.transpose(weights)) + biases)`.
 
-See our [Candidate Sampling Algorithms Reference]
-(../../extras/candidate_sampling.pdf)
+See our
+[Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf)
 
 Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
 ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.depthwise_conv2d_native.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.depthwise_conv2d_native.md
index c2736f1..2e04ee2 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.depthwise_conv2d_native.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.depthwise_conv2d_native.md
@@ -17,7 +17,7 @@
                         filter[di, dj, k, q]
 
 Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
 
 ##### Args:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.nce_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.nce_loss.md
index b0fa637..aa2d46f 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.nce_loss.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.nce_loss.md
@@ -42,8 +42,7 @@
       where a sampled class equals one of the target classes.  If set to
       `True`, this is a "Sampled Logistic" loss instead of NCE, and we are
       learning to generate log-odds instead of log probabilities.  See
-      our [Candidate Sampling Algorithms Reference]
-      (../../extras/candidate_sampling.pdf).
+      our [Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf).
       Default is False.
 *  <b>`partition_strategy`</b>: A string specifying the partitioning strategy, relevant
       if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.local_response_normalization.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.local_response_normalization.md
index 81134df..2738a61 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.local_response_normalization.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.nn.local_response_normalization.md
@@ -11,8 +11,8 @@
         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
     output = input / (bias + alpha * sqr_sum) ** beta
 
-For details, see [Krizhevsky et al., ImageNet classification with deep
-convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+For details, see
+[Krizhevsky et al., ImageNet classification with deep convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
 
 ##### Args:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.conv2d.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.conv2d.md
index d40ed35..3f51a3b 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.conv2d.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.conv2d.md
@@ -22,7 +22,7 @@
                         filter[di, dj, q, k]
 
 Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
 
 ##### Args:
 
diff --git a/tensorflow/g3doc/get_started/os_setup.md b/tensorflow/g3doc/get_started/os_setup.md
index 139e225..8d16894 100644
--- a/tensorflow/g3doc/get_started/os_setup.md
+++ b/tensorflow/g3doc/get_started/os_setup.md
@@ -63,37 +63,37 @@
 
 ```bash
 # Ubuntu/Linux 64-bit, CPU only, Python 2.7
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
 
 # Ubuntu/Linux 64-bit, GPU enabled, Python 2.7
-# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
+# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
 
 # Mac OS X, CPU only, Python 2.7:
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py2-none-any.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py2-none-any.whl
 
 # Mac OS X, GPU enabled, Python 2.7:
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py2-none-any.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py2-none-any.whl
 
 # Ubuntu/Linux 64-bit, CPU only, Python 3.4
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl
 
 # Ubuntu/Linux 64-bit, GPU enabled, Python 3.4
-# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl
+# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl
 
 # Ubuntu/Linux 64-bit, CPU only, Python 3.5
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl
 
 # Ubuntu/Linux 64-bit, GPU enabled, Python 3.5
-# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl
+# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl
 
 # Mac OS X, CPU only, Python 3.4 or 3.5:
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py3-none-any.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py3-none-any.whl
 
 # Mac OS X, GPU enabled, Python 3.4 or 3.5:
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py3-none-any.whl
+$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py3-none-any.whl
 ```
 
 Install TensorFlow:
@@ -159,37 +159,37 @@
 
 ```bash
 # Ubuntu/Linux 64-bit, CPU only, Python 2.7
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
 
 # Ubuntu/Linux 64-bit, GPU enabled, Python 2.7
-# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
+# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
 
 # Mac OS X, CPU only, Python 2.7:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py2-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py2-none-any.whl
 
 # Mac OS X, GPU enabled, Python 2.7:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py2-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py2-none-any.whl
 
 # Ubuntu/Linux 64-bit, CPU only, Python 3.4
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl
 
 # Ubuntu/Linux 64-bit, GPU enabled, Python 3.4
-# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl
+# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl
 
 # Ubuntu/Linux 64-bit, CPU only, Python 3.5
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl
 
 # Ubuntu/Linux 64-bit, GPU enabled, Python 3.5
-# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl
+# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl
 
 # Mac OS X, CPU only, Python 3.4 or 3.5:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py3-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py3-none-any.whl
 
 # Mac OS X, GPU enabled, Python 3.4 or 3.5:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py3-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py3-none-any.whl
 ```
 
 Finally install TensorFlow:
@@ -298,37 +298,37 @@
 
 ```bash
 # Ubuntu/Linux 64-bit, CPU only, Python 2.7
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
 
 # Ubuntu/Linux 64-bit, GPU enabled, Python 2.7
-# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
+# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
 
 # Mac OS X, CPU only, Python 2.7:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py2-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py2-none-any.whl
 
 # Mac OS X, GPU enabled, Python 2.7:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py2-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py2-none-any.whl
 
 # Ubuntu/Linux 64-bit, CPU only, Python 3.4
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl
 
 # Ubuntu/Linux 64-bit, GPU enabled, Python 3.4
-# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl
+# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl
 
 # Ubuntu/Linux 64-bit, CPU only, Python 3.5
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl
 
 # Ubuntu/Linux 64-bit, GPU enabled, Python 3.5
-# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below.
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl
+# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl
 
 # Mac OS X, CPU only, Python 3.4 or 3.5:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py3-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py3-none-any.whl
 
 # Mac OS X, GPU enabled, Python 3.4 or 3.5:
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py3-none-any.whl
+(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py3-none-any.whl
 ```
 
 Finally install TensorFlow:
@@ -396,13 +396,13 @@
 code.
 
 We also have tags with `latest` replaced by a released version (e.g.,
-`0.11.0-gpu`).
+`0.11.0rc1-gpu`).
 
 With Docker the installation is as follows:
 
 *  Install Docker on your machine.
 *  Create a [Docker
-group](http://docs.docker.com/engine/installation/ubuntulinux/#create-a-docker-group)
+group](https://docs.docker.com/engine/installation/linux/ubuntulinux/#/create-a-docker-group)
 to allow launching containers without `sudo`.
 *  Launch a Docker container with the TensorFlow image.  The image
    gets downloaded automatically on first launch.
@@ -780,7 +780,7 @@
 $ bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
 
 # The name of the .whl file will depend on your platform.
-$ sudo pip install /tmp/tensorflow_pkg/tensorflow-0.11.0rc0-py2-none-any.whl
+$ sudo pip install /tmp/tensorflow_pkg/tensorflow-0.11.0rc1-py2-none-any.whl
 ```
 
 ## Setting up TensorFlow for Development
diff --git a/tensorflow/g3doc/tutorials/wide/index.md b/tensorflow/g3doc/tutorials/wide/index.md
index 1bad7ea..643599f 100644
--- a/tensorflow/g3doc/tutorials/wide/index.md
+++ b/tensorflow/g3doc/tutorials/wide/index.md
@@ -222,12 +222,12 @@
 feature values of a column and there are only a few of them, you can use
 `sparse_column_with_keys`. Each key in the list will get assigned an
 auto-incremental ID starting from 0. For example, for the `gender` column we can
-assign the feature string "female" to an integer ID of 0 and "male" to 1 by
+assign the feature string "Female" to an integer ID of 0 and "Male" to 1 by
 doing:
 
 ```python
 gender = tf.contrib.layers.sparse_column_with_keys(
-  column_name="gender", keys=["female", "male"])
+  column_name="gender", keys=["Female", "Male"])
 ```
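The resulting mapping is simply positional; illustratively:

```python
# Equivalent key-to-ID assignment: "Female" -> 0, "Male" -> 1.
id_by_key = {key: i for i, key in enumerate(["Female", "Male"])}
assert id_by_key == {"Female": 0, "Male": 1}
# Keys outside the list fall back to the column's default_value (-1 unless overridden).
```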
 
 What if we don't know the set of possible values in advance? Not a problem. We
diff --git a/tensorflow/g3doc/tutorials/wide_and_deep/index.md b/tensorflow/g3doc/tutorials/wide_and_deep/index.md
index da7b2f7..760e4ba 100644
--- a/tensorflow/g3doc/tutorials/wide_and_deep/index.md
+++ b/tensorflow/g3doc/tutorials/wide_and_deep/index.md
@@ -16,7 +16,8 @@
 you're interested in learning more about how Wide & Deep Learning works, please
 check out our [research paper](http://arxiv.org/abs/1606.07792).
 
-![Wide & Deep Spectrum of Models](../../images/wide_n_deep.svg "Wide & Deep")
+![Wide & Deep Spectrum of Models]
+(../../images/wide_n_deep.svg "Wide & Deep")
 
 The figure above shows a comparison of a wide model (logistic regression with
 sparse features and transformations), a deep model (feed-forward neural network
@@ -85,7 +86,9 @@
 import tensorflow as tf
 
 # Categorical base columns.
-gender = tf.contrib.layers.sparse_column_with_keys(column_name="gender", keys=["female", "male"])
+gender = tf.contrib.layers.sparse_column_with_keys(column_name="gender", keys=["Female", "Male"])
+race = tf.contrib.layers.sparse_column_with_keys(column_name="race", keys=[
+  "Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"])
 education = tf.contrib.layers.sparse_column_with_hash_bucket("education", hash_bucket_size=1000)
 relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100)
 workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)
diff --git a/tensorflow/models/image/cifar10/cifar10.py b/tensorflow/models/image/cifar10/cifar10.py
index 4908964..fb3a42c 100644
--- a/tensorflow/models/image/cifar10/cifar10.py
+++ b/tensorflow/models/image/cifar10/cifar10.py
@@ -391,4 +391,5 @@
     print()
     statinfo = os.stat(filepath)
     print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
-    tarfile.open(filepath, 'r:gz').extractall(dest_directory)
+
+  tarfile.open(filepath, 'r:gz').extractall(dest_directory)
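The dedent above moves the extraction out of the download branch, so the archive is unpacked even when it already exists locally. A condensed sketch of the resulting control flow (not the verbatim source):

```python
import os
import tarfile
from six.moves import urllib

def maybe_download_and_extract(data_url, dest_directory):
  filename = data_url.split('/')[-1]
  filepath = os.path.join(dest_directory, filename)
  if not os.path.exists(filepath):
    # Download only when the archive is missing...
    filepath, _ = urllib.request.urlretrieve(data_url, filepath)
  # ...but extract unconditionally, which is what the dedent achieves.
  tarfile.open(filepath, 'r:gz').extractall(dest_directory)
```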
diff --git a/tensorflow/models/rnn/ptb/ptb_word_lm.py b/tensorflow/models/rnn/ptb/ptb_word_lm.py
index 39b5cb4..f4f2888 100644
--- a/tensorflow/models/rnn/ptb/ptb_word_lm.py
+++ b/tensorflow/models/rnn/ptb/ptb_word_lm.py
@@ -339,7 +339,7 @@
       tf.scalar_summary("Validation Loss", mvalid.cost)
 
     with tf.name_scope("Test"):
-      test_input = PTBInput(config=config, data=test_data, name="TestInput")
+      test_input = PTBInput(config=eval_config, data=test_data, name="TestInput")
       with tf.variable_scope("Model", reuse=True, initializer=initializer):
         mtest = PTBModel(is_training=False, config=eval_config,
                          input_=test_input)
@@ -347,7 +347,7 @@
     sv = tf.train.Supervisor(logdir=FLAGS.save_path)
     with sv.managed_session() as session:
       for i in range(config.max_max_epoch):
-        lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
+        lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
         m.assign_lr(session, config.learning_rate * lr_decay)
 
         print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 7e3206e..baa48ec 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -213,7 +213,7 @@
     additional_deps = ["//tensorflow:tensorflow_py"],
 )
 
-tf_py_test(
+cuda_py_test(
     name = "matrix_triangular_solve_op_test",
     size = "small",
     srcs = ["matrix_triangular_solve_op_test.py"],
diff --git a/tensorflow/python/kernel_tests/conv2d_transpose_test.py b/tensorflow/python/kernel_tests/conv2d_transpose_test.py
index a3fddcb..77f783d 100644
--- a/tensorflow/python/kernel_tests/conv2d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv2d_transpose_test.py
@@ -21,6 +21,7 @@
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
+from tensorflow.python.client import device_lib
 
 
 class Conv2DTransposeTest(tf.test.TestCase):
@@ -157,6 +158,119 @@
     err_tolerance = 0.0005
     self.assertLess(err, err_tolerance)
 
+  def testConv2DTransposeSingleStrideNCHW(self):
+    # The `NCHW` data format is only supported on `GPU` devices.
+    if tf.test.is_gpu_available():
+      with self.test_session(use_gpu=True):
+        strides = [1, 1, 1, 1]
+
+        # Input, output: [batch, depth, height, width]
+        x_shape = [2, 3, 6, 4]
+        y_shape = [2, 2, 6, 4]
+
+        # Filter: [kernel_height, kernel_width, output_depth, input_depth]
+        f_shape = [3, 3, 2, 3]
+
+        x = tf.constant(1.0, shape=x_shape, name="x", dtype=tf.float32)
+        f = tf.constant(1.0, shape=f_shape, name="filter", dtype=tf.float32)
+
+        output = tf.nn.conv2d_transpose(x, f, y_shape, strides=strides,
+                                        padding="SAME", data_format='NCHW')
+
+        value = output.eval()
+        for n in xrange(x_shape[0]):
+          for k in xrange(f_shape[2]):
+            for w in xrange(y_shape[3]):
+              for h in xrange(y_shape[2]):
+                target = 4 * 3.0
+                h_in = h > 0 and h < y_shape[2] - 1
+                w_in = w > 0 and w < y_shape[3] - 1
+                if h_in and w_in:
+                  target += 5 * 3.0
+                elif h_in or w_in:
+                  target += 2 * 3.0
+                self.assertAllClose(target, value[n, k, h, w])
+
+  def testConv2DTransposeSameNCHW(self):
+    # The `NCHW` data format is only supported on `GPU` devices.
+    if tf.test.is_gpu_available():
+      with self.test_session(use_gpu=True):
+        strides = [1, 1, 2, 2]
+
+        # Input, output: [batch, depth, height, width]
+        x_shape = [2, 3, 6, 4]
+        y_shape = [2, 2, 12, 8]
+
+        # Filter: [kernel_height, kernel_width, output_depth, input_depth]
+        f_shape = [3, 3, 2, 3]
+
+        x = tf.constant(1.0, shape=x_shape, name="x", dtype=tf.float32)
+        f = tf.constant(1.0, shape=f_shape, name="filter", dtype=tf.float32)
+
+        output = tf.nn.conv2d_transpose(x, f, y_shape, strides=strides,
+                                        padding="SAME", data_format='NCHW')
+
+        value = output.eval()
+        for n in xrange(x_shape[0]):
+          for k in xrange(f_shape[2]):
+            for w in xrange(y_shape[3]):
+              for h in xrange(y_shape[2]):
+                target = 3.0
+                # We add a case for locations divisible by the stride.
+                h_in = h % strides[2] == 0 and h > 0 and h < y_shape[2] - 1
+                w_in = w % strides[3] == 0 and w > 0 and w < y_shape[3] - 1
+                if h_in and w_in:
+                  target += 9.0
+                elif h_in or w_in:
+                  target += 3.0
+                self.assertAllClose(target, value[n, k, h, w])
+
+  def testConv2DTransposeValidNCHW(self):
+    # The `NCHW` data format is only supported on `GPU` devices.
+    if tf.test.is_gpu_available():
+      with self.test_session(use_gpu=True):
+        strides = [1, 1, 2, 2]
+
+        # Input, output: [batch, depth, height, width]
+        x_shape = [2, 3, 6, 4]
+        y_shape = [2, 2, 13, 9]
+
+        # Filter: [kernel_height, kernel_width, output_depth, input_depth]
+        f_shape = [3, 3, 2, 3]
+
+        x = tf.constant(1.0, shape=x_shape, name="x", dtype=tf.float32)
+        f = tf.constant(1.0, shape=f_shape, name="filter", dtype=tf.float32)
+        output = tf.nn.conv2d_transpose(x, f, y_shape, strides=strides,
+                                        padding="VALID", data_format='NCHW')
+
+        value = output.eval()
+        cache_values = np.zeros(y_shape, dtype=np.float32)
+        # The amount of padding added
+        pad = 1
+        for n in xrange(x_shape[0]):
+          for k in xrange(f_shape[2]):
+            for w in xrange(pad, y_shape[3] - pad):
+              for h in xrange(pad, y_shape[2] - pad):
+                target = 3.0
+                # We add a case for locations divisible by the stride.
+                h_in = (h % strides[2] == 0 and h > pad and
+                        h < y_shape[2] - 1 - pad)
+                w_in = (w % strides[3] == 0 and w > pad and
+                        w < y_shape[3] - 1 - pad)
+                if h_in and w_in:
+                  target += 9.0
+                elif h_in or w_in:
+                  target += 3.0
+                cache_values[n, k, h, w] = target
+
+            # Copy values along the border from the adjacent interior cells.
+            cache_values[n, k, :, 0] = cache_values[n, k, :, 1]
+            cache_values[n, k, :, -1] = cache_values[n, k, :, -2]
+            cache_values[n, k, 0, :] = cache_values[n, k, 1, :]
+            cache_values[n, k, -1, :] = cache_values[n, k, -2, :]
+
+        self.assertAllClose(cache_values, value)
+
 
 if __name__ == "__main__":
   tf.test.main()
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index cef8bfd..60eb7c2 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -1356,6 +1356,18 @@
     elif x.dtype == np.float64:
       self.assertAllClose(jacob_t, jacob_n, rtol=1e-5, atol=1e-5)
 
+  def testScalar(self):
+    c = True
+    x = np.random.rand(1, 3, 2) * 100
+    y = np.random.rand(1, 3, 2) * 100
+    for t in [np.float16, np.float32, np.float64, np.int32, np.int64,
+              np.complex64, np.complex128]:
+      xt = x.astype(t)
+      yt = y.astype(t)
+      self._compare(c, xt, yt, use_gpu=False)
+      if t in [np.float16, np.float32, np.float64]:
+        self._compare(c, xt, yt, use_gpu=True)
+
   def testBasic(self):
     c = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2)
     x = np.random.rand(1, 3, 2) * 100
diff --git a/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py b/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
index 411f51a..c415482 100644
--- a/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
@@ -24,15 +24,17 @@
 class MatrixTriangularSolveOpTest(tf.test.TestCase):
 
   def _verifySolveAllWays(self, x, y, batch_dims=None):
-    for lower in True, False:
-      for adjoint in True, False:
-        self._verifySolve(x,
-                          y,
-                          lower=lower,
-                          adjoint=adjoint,
-                          batch_dims=batch_dims)
+    for use_gpu in True, False:
+      for lower in True, False:
+        for adjoint in True, False:
+          self._verifySolve(x,
+                            y,
+                            lower=lower,
+                            adjoint=adjoint,
+                            batch_dims=batch_dims,
+                            use_gpu=use_gpu)
 
-  def _verifySolve(self, x, y, lower=True, adjoint=False, batch_dims=None):
+  def _verifySolve(self, x, y, lower=True, adjoint=False, batch_dims=None, use_gpu=False):
     for np_type in [np.float32, np.float64]:
       a = x.astype(np_type)
       b = y.astype(np_type)
@@ -52,7 +54,7 @@
         a_np = np.tile(a_np, batch_dims + [1, 1])
         b = np.tile(b, batch_dims + [1, 1])
 
-      with self.test_session():
+      with self.test_session(use_gpu=use_gpu):
         tf_ans = tf.matrix_triangular_solve(a, b, lower=lower, adjoint=adjoint)
         out = tf_ans.eval()
         np_ans = np.linalg.solve(a_np, b)
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index b644f2a..4755edc 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -264,6 +264,42 @@
     print("elu (float64) gradient err = ", err)
     self.assertLess(err, 1e-6)
 
+  def testGradGradFloat32(self):
+    with self.test_session():
+      x = tf.constant(
+          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+          shape=[2, 5], name="x")
+      y = tf.nn.elu(x, name="elu")
+      z = tf.gradients(y, x)
+      x_init = np.asarray(
+          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+          dtype=np.float32, order="F")
+      err = tf.test.compute_gradient_error(x,
+                                           [2, 5],
+                                           z[0],
+                                           [2, 5],
+                                           x_init_value=x_init)
+    print("elu (float32) gradient of gradient err = ", err)
+    self.assertLess(err, 1e-4)
 
+  def testGradGradFloat64(self):
+    with self.test_session():
+      x = tf.constant(
+          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+          shape=[2, 5], dtype=tf.float64, name="x")
+      y = tf.nn.elu(x, name="elu")
+      z = tf.gradients(y, x)
+      x_init = np.asarray(
+          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+          dtype=np.float64, order="F")
+      err = tf.test.compute_gradient_error(x,
+                                           [2, 5],
+                                           z[0],
+                                           [2, 5],
+                                           x_init_value=x_init)
+    print("elu (float64) gradient of gradient err = ", err)
+    self.assertLess(err, 1e-6)
+
+
 if __name__ == "__main__":
   tf.test.main()
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index f5655f8..1a34634 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1795,7 +1795,7 @@
   performed
   instead:
   ```prettyprint
-  tf.cumprod([a, b, c], exclusive=True) ==> [0, a, a * b]
+  tf.cumprod([a, b, c], exclusive=True) ==> [1, a, a * b]
   ```
 
   By setting the `reverse` kwarg to `True`, the cumprod is performed in the
@@ -1807,7 +1807,7 @@
 
   The `reverse` and `exclusive` kwargs can also be combined:
   ```prettyprint
-  tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 0]
+  tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 1]
   ```
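Concretely, with the corrected multiplicative identity of 1:

```python
import tensorflow as tf

with tf.Session() as sess:
  x = tf.constant([2., 3., 4.])
  print(sess.run(tf.cumprod(x, exclusive=True)))                # [1. 2. 6.]
  print(sess.run(tf.cumprod(x, exclusive=True, reverse=True)))  # [12. 4. 1.]
```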
 
   Args:
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 6a35cfb..149bde4 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -25,7 +25,7 @@
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import gen_nn_ops
-
+from tensorflow.python.ops import gen_math_ops
 
 @ops.RegisterGradient("Conv2DBackpropInput")
 def _Conv2DBackpropInputGrad(op, grad):
@@ -268,6 +268,14 @@
   return gen_nn_ops._relu_grad(grad, op.outputs[0])
 
 
+@ops.RegisterGradient("EluGrad")
+def _EluGradGrad(op, grad):
+  x = op.inputs[1]
+  return (gen_nn_ops._elu_grad(grad, op.outputs[0]),
+          gen_math_ops.select(x < 0., gen_nn_ops._elu_grad(grad, op.outputs[0] + 1),
+                              array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype)))
+
+
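The branch structure follows from the ELU derivatives. Note that `x = op.inputs[1]` is the ELU *output*; since ELU is monotone with `elu(0) = 0`, testing `x < 0.` is equivalent to testing the op's input. A quick NumPy check of the identity the negative branch leans on:

```python
import numpy as np

# elu(x) = exp(x) - 1 for x <= 0, so elu'(x) = exp(x) = elu(x) + 1 there, and
# the second derivative equals elu'(x) as well; for x > 0 all higher
# derivatives are zero, which is what the zeros branch above encodes.
x = -1.5
elu = np.exp(x) - 1
assert np.isclose(np.exp(x), elu + 1)
```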
 @ops.RegisterGradient("Relu6")
 def _Relu6Grad(op, grad):
   return gen_nn_ops._relu6_grad(grad, op.inputs[0])
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 421e767..5b08dcd 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1010,6 +1010,7 @@
                      output_shape,
                      strides,
                      padding="SAME",
+                     data_format="NHWC",
                      name=None):
   """The transpose of `conv2d`.
 
@@ -1020,7 +1021,8 @@
 
   Args:
     value: A 4-D `Tensor` of type `float` and shape
-      `[batch, height, width, in_channels]`.
+      `[batch, height, width, in_channels]` for `NHWC` data format or
+      `[batch, in_channels, height, width]` for `NCHW` data format.
     filter: A 4-D `Tensor` with the same type as `value` and shape
       `[height, width, output_channels, in_channels]`.  `filter`'s
       `in_channels` dimension must match that of `value`.
@@ -1030,6 +1032,7 @@
       dimension of the input tensor.
     padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
       See the [comment here](https://www.tensorflow.org/api_docs/python/nn.html#convolution)
+    data_format: A string. 'NHWC' and 'NCHW' are supported.
     name: Optional name for the returned tensor.
 
   Returns:
@@ -1041,9 +1044,12 @@
   """
   with ops.name_scope(name, "conv2d_transpose",
                       [value, filter, output_shape]) as name:
+    if data_format not in ("NCHW", "NHWC"):
+      raise ValueError("data_format has to be either NCHW or NHWC.")
     value = ops.convert_to_tensor(value, name="value")
     filter = ops.convert_to_tensor(filter, name="filter")
-    if not value.get_shape()[3].is_compatible_with(filter.get_shape()[3]):
+    axis = 3 if data_format == "NHWC" else 1
+    if not value.get_shape()[axis].is_compatible_with(filter.get_shape()[3]):
       raise ValueError("input channels does not match filter's input channels, "
                        "{} != {}".format(value.get_shape()[3], filter.get_shape(
                        )[3]))
@@ -1055,10 +1061,10 @@
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [4] if reached this point.
-      if not filter.get_shape()[2].is_compatible_with(output_shape[3]):
+      if not filter.get_shape()[2].is_compatible_with(output_shape[axis]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[3], filter.get_shape()[2]))
+            "{} != {}".format(output_shape[axis], filter.get_shape()[2]))
 
     if padding != "VALID" and padding != "SAME":
       raise ValueError("padding must be either VALID or SAME:"
@@ -1069,6 +1075,7 @@
                                             out_backprop=value,
                                             strides=strides,
                                             padding=padding,
+                                            data_format=data_format,
                                             name=name)
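A minimal usage sketch of the new argument, mirroring the shapes exercised in the tests above (`NCHW` currently requires a GPU):

```python
import tensorflow as tf

x = tf.ones([2, 3, 6, 4])   # [batch, in_channels, height, width]
f = tf.ones([3, 3, 2, 3])   # [height, width, output_channels, in_channels]
y = tf.nn.conv2d_transpose(x, f, output_shape=[2, 2, 12, 8],
                           strides=[1, 1, 2, 2], padding="SAME",
                           data_format="NCHW")
```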
 
 
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index a86586d..1d9dc3f 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -68,7 +68,7 @@
       Must be positive.  See the decay computation above.
     decay_rate: A scalar `float32` or `float64` `Tensor` or a
       Python number.  The decay rate.
-    staircase: Boolean.  It `True` decay the learning rate at discrete intervals
+    staircase: Boolean.  If `True` decay the learning rate at discrete intervals
     name: String.  Optional name of the operation.  Defaults to
       'ExponentialDecay'.
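A short sketch of the documented behavior (illustrative hyperparameters):

```python
import tensorflow as tf

global_step = tf.Variable(0, trainable=False)
# With staircase=True the rate drops in discrete steps every decay_steps;
# with staircase=False it decays continuously.
lr = tf.train.exponential_decay(0.1, global_step,
                                decay_steps=1000, decay_rate=0.96,
                                staircase=True)
```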
 
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index 54e00d5..def2805 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -15,7 +15,10 @@
 
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 
+#if !defined(PLATFORM_WINDOWS)
 #include <dirent.h>
+#endif
+
 #include <limits.h>
 #include <stddef.h>
 #include <stdio.h>
@@ -25,11 +28,13 @@
 #include <IOKit/kext/KextManager.h>
 #include <mach-o/dyld.h>
 #else
+#if !defined(PLATFORM_WINDOWS)
 #include <link.h>
-#include <sys/stat.h>
 #include <sys/sysmacros.h>
-#endif
 #include <unistd.h>
+#endif
+#include <sys/stat.h>
+#endif
 #include <algorithm>
 #include <memory>
 #include <vector>
@@ -135,7 +140,7 @@
               << "(" << port::Hostname() << ")";
   }
   CFRelease(kext_infos);
-#else
+#elif !defined(PLATFORM_WINDOWS)
   if (access(kDriverVersionPath, F_OK) != 0) {
     LOG(INFO) << "kernel driver does not appear to be running on this host "
               << "(" << port::Hostname() << "): "
@@ -158,7 +163,7 @@
 
 /* static */ void Diagnostician::LogDriverVersionInformation() {
   LOG(INFO) << "hostname: " << port::Hostname();
-
+#ifndef PLATFORM_WINDOWS
   if (VLOG_IS_ON(1)) {
     const char *value = getenv("LD_LIBRARY_PATH");
     string library_path = value == nullptr ? "" : value;
@@ -180,17 +185,17 @@
       closedir(dir);
     }
   }
-
   port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
   LOG(INFO) << "libcuda reported version is: "
             << DriverVersionStatusToString(dso_version);
 
   port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
   LOG(INFO) << "kernel reported version is: "
-            << DriverVersionStatusToString(kernel_version);
+            << DriverVersionStatusToString(kernel_version);
+#endif
 
   // OS X kernel driver does not report version accurately
-#if !defined(__APPLE__)
+#if !defined(__APPLE__) && !defined(PLATFORM_WINDOWS)
   if (kernel_version.ok() && dso_version.ok()) {
     WarnOnDsoKernelMismatch(dso_version, kernel_version);
   }
@@ -227,6 +232,7 @@
       result = StringToDriverVersion(version);
     }
 #else
+#if !defined(PLATFORM_WINDOWS)
   // Callback used when iterating through DSOs. Looks for the driver-interfacing
   // DSO and yields its version number into the callback data, when found.
   auto iterate_phdr =
@@ -259,6 +265,7 @@
 
   dl_iterate_phdr(iterate_phdr, &result);
 #endif
+#endif
 
   return result;
 }
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 1c13379..64e0641 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -3200,6 +3200,7 @@
     Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
     const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) {
   LOG(FATAL) << "not yet implemented";  // TODO(leary)
+  return false;
 }
 
 bool CudnnSupport::DoNormalizeWithDimensions(
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index 095c82a..6a4c689 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -19,8 +19,8 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <set>
-#include "tensorflow/stream_executor/platform/port.h"
 
+#include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 #include "tensorflow/stream_executor/dso_loader.h"
 #include "tensorflow/stream_executor/lib/casts.h"
@@ -38,6 +38,14 @@
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/lib/inlined_vector.h"
 
+#if defined(PLATFORM_WINDOWS)
+// TODO: on Windows, ARRAYSIZE is defined in winnt.h, but including it
+// here creates a conflict with cuda.h; for now, define it here.
+#define ARRAYSIZE(a) \
+  ((sizeof(a) / sizeof(*(a))) / \
+  static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
+#endif
+
 bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
 bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
 bool FLAGS_gpuexec_cuda_device_0_only = false;
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 52256a7..f69853d 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -18,8 +18,12 @@
 #if defined(__APPLE__)
 #include <mach-o/dyld.h>
 #endif
+#if defined(PLATFORM_WINDOWS)
+#include <windows.h>
+#define PATH_MAX MAX_PATH
+#else
 #include <unistd.h>
-
+#endif
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
 #include "tensorflow/stream_executor/cuda/cuda_event.h"
@@ -204,7 +208,12 @@
     _NSGetExecutablePath(unresolved_path, &buffer_size);
     CHECK_ERR(realpath(unresolved_path, exe_path) ? 1 : -1);
 #else
-    CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
+#if defined(PLATFORM_WINDOWS)
+  HMODULE hModule = GetModuleHandle(NULL);
+  GetModuleFileName(hModule, exe_path, MAX_PATH);
+#else
+  CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
+#endif
 #endif
   // Make sure it's null-terminated:
   exe_path[sizeof(exe_path) - 1] = 0;
@@ -908,8 +917,10 @@
   // could use the file::* utilities).
   FILE *file = fopen(filename.c_str(), "r");
   if (file == nullptr) {
+#if !defined(PLATFORM_WINDOWS)
     LOG(ERROR) << "could not open file to read NUMA node: " << filename
                << "\nYour kernel may have been built without NUMA support.";
+#endif
     return kUnknownNumaNode;
   }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc
index a0ee677..88b3a4f 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.cc
+++ b/tensorflow/stream_executor/cuda/cuda_rng.cc
@@ -15,8 +15,6 @@
 
 #include "tensorflow/stream_executor/cuda/cuda_rng.h"
 
-#include <dlfcn.h>
-
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
 #include "tensorflow/stream_executor/cuda/cuda_helpers.h"
diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/dso_loader.cc
index c9b305a..319f456 100644
--- a/tensorflow/stream_executor/dso_loader.cc
+++ b/tensorflow/stream_executor/dso_loader.cc
@@ -18,13 +18,17 @@
 
 #include "tensorflow/stream_executor/dso_loader.h"
 
-#include <dlfcn.h>
 #include <limits.h>
 #if defined(__APPLE__)
 #include <mach-o/dyld.h>
 #endif
 #include <stdlib.h>
+#if defined(PLATFORM_WINDOWS)
+#include <windows.h>
+#define PATH_MAX MAX_PATH
+#else
 #include <unistd.h>
+#endif
 #include <initializer_list>
 #include <vector>
 
@@ -45,7 +49,7 @@
 string GetCudnnVersion() { return TF_CUDNN_VERSION; }
 
 /* static */ port::Status DsoLoader::GetCublasDsoHandle(void** dso_handle) {
-  return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName(
+  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
                                       "cublas", GetCudaVersion()),
                                   GetCudaLibraryDirPath()),
                       dso_handle);
@@ -55,35 +59,42 @@
   // libcudnn is versioned differently than the other libraries and may have a
   // different version number than other CUDA libraries.  See b/22397368 for
   // some details about the complications surrounding this.
-  return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName(
+  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
                                       "cudnn", GetCudnnVersion()),
                                   GetCudaLibraryDirPath()),
                       dso_handle);
 }
 
 /* static */ port::Status DsoLoader::GetCufftDsoHandle(void** dso_handle) {
-  return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName(
+  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
                                       "cufft", GetCudaVersion()),
                                   GetCudaLibraryDirPath()),
                       dso_handle);
 }
 
 /* static */ port::Status DsoLoader::GetCurandDsoHandle(void** dso_handle) {
-  return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName(
+  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
                                       "curand", GetCudaVersion()),
                                   GetCudaLibraryDirPath()),
                       dso_handle);
 }
 
 /* static */ port::Status DsoLoader::GetLibcudaDsoHandle(void** dso_handle) {
+#if defined(PLATFORM_WINDOWS)
   return GetDsoHandle(
-      FindDsoPath(tensorflow::internal::FormatLibraryFileName("cuda", "1"),
+      FindDsoPath(port::Env::Default()->FormatLibraryFileName("nvcuda", ""),
                   GetCudaDriverLibraryPath()),
       dso_handle);
+#else
+  return GetDsoHandle(
+      FindDsoPath(port::Env::Default()->FormatLibraryFileName("cuda", "1"),
+                  GetCudaDriverLibraryPath()),
+      dso_handle);
+#endif
 }
 
 /* static */ port::Status DsoLoader::GetLibcuptiDsoHandle(void** dso_handle) {
-  return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName(
+  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
                                       "cupti", GetCudaVersion()),
                                   GetCudaCuptiLibraryPath()),
                       dso_handle);
@@ -101,8 +112,6 @@
     return port::Status(port::error::INVALID_ARGUMENT,
                         "Only LoadKind::kLocal is currently supported");
   }
-  int dynload_flags =
-      RTLD_LAZY | (load_kind == LoadKind::kLocal ? RTLD_LOCAL : RTLD_GLOBAL);
   string path_string = path.ToString();
   port::Status s =
       port::Env::Default()->LoadLibrary(path_string.c_str(), dso_handle);
@@ -125,6 +134,9 @@
   char unresolved_path[buffer_size];
   _NSGetExecutablePath(unresolved_path, &buffer_size);
   CHECK_ERR(realpath(unresolved_path, exe_path) ? 1 : -1);
+#elif defined(PLATFORM_WINDOWS)
+  HMODULE hModule = GetModuleHandle(NULL);
+  GetModuleFileName(hModule, exe_path, MAX_PATH);
 #else
   CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
 #endif
@@ -159,6 +171,9 @@
 }
 
 /* static */ bool DsoLoader::TrySymbolicDereference(string* candidate) {
+#if defined(PLATFORM_WINDOWS)
+  return false;
+#else
   char buf[PATH_MAX];
   char* result = realpath(candidate->c_str(), buf);
   if (result == nullptr) {
@@ -168,6 +183,7 @@
           << result << "\"";
   *candidate = result;
   return true;
+#endif
 }
 
 /* static */ string DsoLoader::FindDsoPath(port::StringPiece library_name,
@@ -206,6 +222,8 @@
 /* static */ string DsoLoader::GetCudaDriverLibraryPath() {
 #if defined(__APPLE__)
   return "external/local_config_cuda/cuda/driver/lib";
+#elif defined(PLATFORM_WINDOWS)
+  return "";
 #else
   return "external/local_config_cuda/cuda/driver/lib64";
 #endif
diff --git a/tensorflow/stream_executor/lib/process_state.cc b/tensorflow/stream_executor/lib/process_state.cc
index fcf8847..be4295b 100644
--- a/tensorflow/stream_executor/lib/process_state.cc
+++ b/tensorflow/stream_executor/lib/process_state.cc
@@ -15,8 +15,13 @@
 
 #include "tensorflow/stream_executor/lib/process_state.h"
 
+#if defined(PLATFORM_WINDOWS)
+#include <direct.h>
+#include <stdlib.h>
+#include <WinSock2.h>
+#else
 #include <unistd.h>
-
+#endif
 #include <memory>
 
 namespace perftools {
@@ -27,7 +32,7 @@
   char hostname[1024];
   gethostname(hostname, sizeof hostname);
   hostname[sizeof hostname - 1] = 0;
-  return hostname;
+  return std::string(hostname);
 }
 
 bool GetCurrentDirectory(string* dir) {
diff --git a/tensorflow/stream_executor/lib/static_threadlocal.h b/tensorflow/stream_executor/lib/static_threadlocal.h
index a839420..6e2bd0d 100644
--- a/tensorflow/stream_executor/lib/static_threadlocal.h
+++ b/tensorflow/stream_executor/lib/static_threadlocal.h
@@ -16,6 +16,10 @@
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATIC_THREADLOCAL_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STATIC_THREADLOCAL_H_
 
+#ifdef _MSC_VER
+#define __thread __declspec(thread)
+#endif
+
 // For POD types in TLS mode, s_obj_VAR is the thread-local variable.
 #define SE_STATIC_THREAD_LOCAL_POD(_Type_, _var_)               \
   static __thread _Type_ s_obj_##_var_;                         \
diff --git a/tensorflow/tensorboard/backend/server.py b/tensorflow/tensorboard/backend/server.py
index 630d342..f590b5e 100644
--- a/tensorflow/tensorboard/backend/server.py
+++ b/tensorflow/tensorboard/backend/server.py
@@ -81,7 +81,7 @@
     else:
       run_name = None
       path = specification
-    if not io_wrapper.IsGCSPath(path):
+    if not (io_wrapper.IsGCSPath(path) or path.startswith('hdfs://')):
       path = os.path.realpath(path)
     files[path] = run_name
   return files
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 6db3978..24fd1d3 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -563,7 +563,7 @@
   for dep in ctx.attr.deps:
     inputs += dep.cc.transitive_headers
   inputs += ctx.files._swiglib
-  swig_include_dirs = set([f.root.path for f in inputs if f.root.path])
+  swig_include_dirs = set(_get_repository_roots(ctx, inputs))
   swig_include_dirs += sorted([f.dirname for f in ctx.files._swiglib])
   args = ["-c++",
           "-python",
@@ -616,6 +616,35 @@
     implementation = _py_wrap_cc_impl,
 )
 
+def _get_repository_roots(ctx, files):
+  """Returns abnormal root directories under which files reside.
+
+  When running a ctx.action, source files within the main repository are all
+  relative to the current directory; however, files that are generated or exist
+  in remote repositories will have their root directory be a subdirectory,
+  e.g. bazel-out/local-fastbuild/genfiles/external/jpeg_archive. This function
+  returns the set of these devious directories, ranked and sorted by popularity
+  in order to hopefully minimize the number of I/O system calls within the
+  compiler, because includes have quadratic complexity.
+  """
+  result = {}
+  for f in files:
+    root = f.root.path
+    if root:
+      if root not in result:
+        result[root] = 0
+      result[root] -= 1
+    work = f.owner.workspace_root
+    if work:
+      if root:
+        root += "/"
+      root += work
+    if root:
+      if root not in result:
+        result[root] = 0
+      result[root] -= 1
+  return [k for v, k in sorted([(v, k) for k, v in result.items()])]
+
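The negative counts make Python's ascending sort yield the most popular root first; a toy illustration (hypothetical roots):

```python
result = {"bazel-out/local-fastbuild/genfiles": -3, "external/jpeg_archive": -1}
print([k for v, k in sorted([(v, k) for k, v in result.items()])])
# ['bazel-out/local-fastbuild/genfiles', 'external/jpeg_archive']
```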
 # Bazel rule for collecting the header files that a target depends on.
 def _transitive_hdrs_impl(ctx):
   outputs = set()
diff --git a/tensorflow/tools/ci_build/builds/test_installation.sh b/tensorflow/tools/ci_build/builds/test_installation.sh
index 5d7d6ec..09b2ae5 100755
--- a/tensorflow/tools/ci_build/builds/test_installation.sh
+++ b/tensorflow/tools/ci_build/builds/test_installation.sh
@@ -47,10 +47,6 @@
 # TF_BUILD_BAZEL_CLEAN, if set to any non-empty and non-0 value, directs the
 # script to perform bazel clean prior to main build and test steps.
 #
-# TF_BUILD_SERIAL_INSTALL_TESTS, if set to any non-empty and non-0 value,
-# will force the Python install tests to run serially, overriding than the
-# concurrent testing behavior.
-#
 # TF_GPU_COUNT, Set the number of GPUs in the system. We run only this many
 # concurrent tests when running GPU tests.
 #
@@ -411,21 +407,21 @@
 FAILED_TESTS=""
 FAILED_TEST_LOGS=""
 
-N_JOBS=$(grep -c ^processor /proc/cpuinfo)
-if [[ -z ${N_JOBS} ]]; then
-  # Try the Mac way of getting number of CPUs
-  N_JOBS=$(sysctl -n hw.ncpu)
-fi
-
-if [[ -z ${N_JOBS} ]]; then
-  N_JOBS=8
-  echo "Cannot determine the number of processors"
-  echo "Using default concurrent job counter ${N_JOBS}"
-fi
-
-if [[ ! -z "${TF_BUILD_SERIAL_INSTALL_TESTS}" ]] &&
-   [[ "${TF_BUILD_SERIAL_INSTALL_TESTS}" != "0" ]]; then
+if [[ "${IS_GPU}" == "1" ]]; then
   N_JOBS=$TF_GPU_COUNT
+else
+  N_JOBS=$(grep -c ^processor /proc/cpuinfo)
+  if [[ -z ${N_JOBS} ]]; then
+    # Try the Mac way of getting number of CPUs
+    N_JOBS=$(sysctl -n hw.ncpu)
+  fi
+
+  # If the number of CPUs still cannot be determined, default to 8.
+  if [[ -z ${N_JOBS} ]]; then
+    N_JOBS=8
+    echo "Cannot determine the number of processors"
+    echo "Using default concurrent job counter ${N_JOBS}"
+  fi
 fi
 
 echo "Running Python tests-on-install with ${N_JOBS} concurrent jobs..."
@@ -485,9 +481,14 @@
     TEST_LOGS="${TEST_LOGS} ${TEST_LOG}"
 
     # Launch test asynchronously
-    "${SCRIPT_DIR}/../gpu_build/parallel_gpu_execute.sh" \
+    if [[ "${IS_GPU}" == "1" ]]; then
+      "${SCRIPT_DIR}/../gpu_build/parallel_gpu_execute.sh" \
+        "${SCRIPT_DIR}/py_test_delegate.sh" \
+        "${PYTHON_BIN_PATH}" "${PY_TEST_DIR}/${TEST_BASENAME}" "${TEST_LOG}" &
+    else
       "${SCRIPT_DIR}/py_test_delegate.sh" \
-      "${PYTHON_BIN_PATH}" "${PY_TEST_DIR}/${TEST_BASENAME}" "${TEST_LOG}" &
+        "${PYTHON_BIN_PATH}" "${PY_TEST_DIR}/${TEST_BASENAME}" "${TEST_LOG}" &
+    fi
 
     if [[ "${TEST_COUNTER}" -ge "${N_PAR_TESTS}" ]]; then
       # Run in exclusive mode
diff --git a/tensorflow/tools/ci_build/builds/test_tutorials.sh b/tensorflow/tools/ci_build/builds/test_tutorials.sh
old mode 100644
new mode 100755
index aafa76b..aaad47c
--- a/tensorflow/tools/ci_build/builds/test_tutorials.sh
+++ b/tensorflow/tools/ci_build/builds/test_tutorials.sh
@@ -146,7 +146,7 @@
 
   run_in_directory "${TEST_DIR}" "${LOG_FILE}" \
     tensorflow/examples/tutorials/mnist/mnist_with_summaries.py \
-    --data_dir="${TUT_TEST_DATA_DIR}/mnist" --summaries_dir="${SUMMARIES_DIR}"
+    --data_dir="${TUT_TEST_DATA_DIR}/mnist" --log_dir="${SUMMARIES_DIR}"
 
   # Verify final accuracy
   FINAL_ACCURACY=$(grep "Accuracy at step" "${LOG_FILE}" \
diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh
index 0f165cd..54587ef 100755
--- a/tensorflow/tools/ci_build/ci_build.sh
+++ b/tensorflow/tools/ci_build/ci_build.sh
@@ -103,10 +103,8 @@
 BUILD_TAG="${BUILD_TAG:-tf_ci}"
 
 # Add extra params for cuda devices and libraries for GPU container.
-if [ "${CONTAINER_TYPE}" == "gpu" ]; then
-  # GPU pip tests-on-install concurrency is limited to the number of GPUs.
-  GPU_EXTRA_PARAMS="${GPU_EXTRA_PARAMS} -e TF_BUILD_SERIAL_INSTALL_TESTS=1"
-else
+# And clear them if we are not building for GPU.
+if [ "${CONTAINER_TYPE}" != "gpu" ]; then
   GPU_EXTRA_PARAMS=""
 fi
 
diff --git a/tensorflow/tools/dist_test/build_server.sh b/tensorflow/tools/dist_test/build_server.sh
index 178fba8..878fabd 100755
--- a/tensorflow/tools/dist_test/build_server.sh
+++ b/tensorflow/tools/dist_test/build_server.sh
@@ -16,7 +16,14 @@
 #
 # Builds the test server for distributed (GRPC) TensorFlow
 #
-# Usage: build_server.sh <docker_image_name> [--test]
+# Usage: build_server.sh <docker_image_name> <whl_url> [--test]
+#
+# Arguments:
+#   docker_image_name: Name of the docker image to build.
+#     E.g.: tensorflow/tf_grpc_test_server:0.11.0rc1
+#
+#   whl_url: URL from which the TensorFlow whl file will be downloaded.
+#     E.g.: https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
 #
 # The optional flag --test lets the script use the Dockerfile for the
 # testing GRPC server. Without the flag, the script will build the non-test
@@ -33,22 +40,35 @@
 }
 
 # Check arguments
-if [[ $# != 1 ]] && [[ $# != 2 ]]; then
-  die "Usage: $0 <docker_image_name> [--test]"
+if [[ $# -lt 2 ]]; then
+  die "Usage: $0 <docker_image_name> <whl_url> [--test]"
 fi
 
 DOCKER_IMG_NAME=$1
-shift
+WHL_URL=$2
+shift 2
 
 # Current script directory
 DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
-DOCKER_FILE="${DIR}/server/Dockerfile"
+BUILD_DIR=$(mktemp -d)
+echo ""
+echo "Using whl file URL: ${WHL_URL}"
+echo "Building in temporary directory: ${BUILD_DIR}"
+
+cp -r "${DIR}"/* "${BUILD_DIR}"/ || \
+    die "Failed to copy files to ${BUILD_DIR}"
+
+DOCKER_FILE="${BUILD_DIR}/server/Dockerfile"
 if [[ $1 == "--test" ]]; then
-  DOCKER_FILE="${DIR}/server/Dockerfile.test"
+  DOCKER_FILE="${BUILD_DIR}/server/Dockerfile.test"
 fi
 echo "Using Docker file: ${DOCKER_FILE}"
 
+# Download whl file into the build context directory.
+wget -P "${BUILD_DIR}" "${WHL_URL}" || \
+    die "Failed to download tensorflow whl file from URL: ${WHL_URL}"
+
 if [[ ! -f "${DOCKER_FILE}" ]]; then
   die "ERROR: Unable to find dockerfile: ${DOCKER_FILE}"
 fi
@@ -56,5 +76,8 @@
 
 # Call docker build
 docker build --no-cache -t "${DOCKER_IMG_NAME}" \
-   -f "${DOCKER_FILE}" \
-   "${DIR}"
+   -f "${DOCKER_FILE}" "${BUILD_DIR}" || \
+   die "Failed to build docker image: ${DOCKER_IMG_NAME}"
+
+# Clean up docker build context directory.
+rm -rf "${BUILD_DIR}"
diff --git a/tensorflow/tools/dist_test/server/Dockerfile b/tensorflow/tools/dist_test/server/Dockerfile
index 9cc61a8..4b13b81 100644
--- a/tensorflow/tools/dist_test/server/Dockerfile
+++ b/tensorflow/tools/dist_test/server/Dockerfile
@@ -34,9 +34,10 @@
     python get-pip.py && \
     rm get-pip.py
 
-# Install TensorFlow CPU version from nightly build
-RUN pip --no-cache-dir install \
-    https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
+# Install TensorFlow wheel
+COPY tensorflow-*.whl /
+RUN pip install /tensorflow-*.whl && \
+    rm -f /tensorflow-*.whl
 
 # Copy files, including the GRPC server binary at
 # server/grpc_tensorflow_server.py
diff --git a/tensorflow/tools/dist_test/server/Dockerfile.test b/tensorflow/tools/dist_test/server/Dockerfile.test
index 5bafa29..22438f3 100644
--- a/tensorflow/tools/dist_test/server/Dockerfile.test
+++ b/tensorflow/tools/dist_test/server/Dockerfile.test
@@ -40,9 +40,10 @@
 # Install python panda for the census wide&deep test
 RUN pip install --upgrade pandas==0.18.1
 
-# Install TensorFlow CPU version.
-RUN pip --no-cache-dir install \
-    https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
+# Install TensorFlow wheel
+COPY tensorflow-*.whl /
+RUN pip install /tensorflow-*.whl && \
+    rm -f /tensorflow-*.whl
 
 # Copy files, including the GRPC server binary at
 # server/grpc_tensorflow_server.py
diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index bd0b4cc..a8f5f26 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -33,7 +33,7 @@
         && \
     python -m ipykernel.kernelspec
 
-ENV TENSORFLOW_VERSION 0.11.0rc0
+ENV TENSORFLOW_VERSION 0.11.0rc1
 
 # --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 # These lines will be edited automatically by parameterized_docker_build.sh. #
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 9db6b73..9ad57d6 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -33,7 +33,7 @@
         && \
     python -m ipykernel.kernelspec
 
-ENV TENSORFLOW_VERSION 0.11.0rc0
+ENV TENSORFLOW_VERSION 0.11.0rc1
 
 # --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 # These lines will be edited automatically by parameterized_docker_build.sh. #
diff --git a/tensorflow/tools/gcs_test/Dockerfile b/tensorflow/tools/gcs_test/Dockerfile
index 0abe3d6..782a63f 100644
--- a/tensorflow/tools/gcs_test/Dockerfile
+++ b/tensorflow/tools/gcs_test/Dockerfile
@@ -17,7 +17,7 @@
 
 # Install nightly TensorFlow pip
 RUN pip install \
-   https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
+   https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
 
 # Copy test files
 RUN mkdir -p /gcs-smoke/python
diff --git a/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh b/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
index 68800d6..2ce0fb3 100755
--- a/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
+++ b/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
@@ -81,7 +81,6 @@
 cat ${LOG_FILE}
 echo ""
 
-
 # Clean up the newly created tfrecord file in GCS bucket.
 # First, activate gcloud service account
 "${GCLOUD_BIN}" auth activate-service-account \
@@ -96,13 +95,3 @@
 "${GSUTIL_BIN}" rm "${NEW_TFREC_URL}" && \
     echo "Cleaned up new tfrecord file in GCS: ${NEW_TFREC_URL}" || \
     die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}"
-
-# Also clean up newly created GCS dir.
-NEW_DIR_URL=$(grep "Creating dir" "${LOG_FILE}" | \
-                awk '{print $NF}')
-if [[ -z ${NEW_DIR_URL} ]]; then
-  die "FAIL: Unable to determine the URL to the new directory created in GCS."
-fi
-"${GSUTIL_BIN}" rm -r "${NEW_DIR_URL}" && \
-    echo "Cleaned up new directory created in GCS: ${NEW_DIR_URL}" || \
-    die "FAIL: Unable to clean up new directory created in GCS: ${NEW_DIR_URL}"
diff --git a/tensorflow/tools/gcs_test/python/gcs_smoke.py b/tensorflow/tools/gcs_test/python/gcs_smoke.py
index 0e0018f..23f45a9 100644
--- a/tensorflow/tools/gcs_test/python/gcs_smoke.py
+++ b/tensorflow/tools/gcs_test/python/gcs_smoke.py
@@ -35,7 +35,6 @@
 
 FLAGS = flags.FLAGS
 
-
 def create_examples(num_examples, input_mean):
   """Create ExampleProto's containg data."""
   ids = np.arange(num_examples).reshape([num_examples, 1])
@@ -64,12 +63,48 @@
   print("%s directory exists: %s" % (dir_name, dir_exists))
 
   # List contents of just created directory.
-  starttime = int(round(time.time() * 1000))
   print("Listing directory %s." % dir_name)
+  starttime = int(round(time.time() * 1000))
   print(file_io.list_directory(dir_name))
   elapsed = int(round(time.time() * 1000)) - starttime
   print("Listed directory %s in %s milliseconds" % (dir_name, elapsed))
 
+  # Delete directory.
+  print("Deleting directory %s." % dir_name)
+  starttime = int(round(time.time() * 1000))
+  file_io.delete_recursively(dir_name)
+  elapsed = int(round(time.time() * 1000)) - starttime
+  print("Deleted directory %s in %s milliseconds" % (dir_name, elapsed))
+
+def create_object_test():
+  """Verifies file_io's object manipulation methods ."""
+  starttime = int(round(time.time() * 1000))
+  dir_name = "%s/tf_gcs_test_%s" % (FLAGS.gcs_bucket_url, starttime)
+  print("Creating dir %s." % dir_name)
+  file_io.create_dir(dir_name)
+
+  # Create a file in this directory.
+  file_name = "%s/test_file.txt" % dir_name
+  print("Creating file %s." % file_name)
+  file_io.write_string_to_file(file_name, "test file creation.")
+
+  list_files_pattern = "%s/test_file*.txt" % dir_name
+  print("Getting files matching pattern %s." % list_files_pattern)
+  files_list = file_io.get_matching_files(list_files_pattern)
+  print(files_list)
+
+  assert len(files_list) == 1
+  assert files_list[0] == file_name
+
+  # Cleanup test files.
+  print("Deleting file %s." % file_name)
+  file_io.delete_file(file_name)
+
+  # Delete directory.
+  print("Deleting directory %s." % dir_name)
+  file_io.delete_recursively(dir_name)
+
+
 if __name__ == "__main__":
   # Sanity check on the GCS bucket URL.
   if not FLAGS.gcs_bucket_url or not FLAGS.gcs_bucket_url.startswith("gs://"):
@@ -132,4 +167,5 @@
         print("Successfully caught the expected OutOfRangeError while "
               "reading one more record than is available")
 
-    create_dir_test()
+  create_dir_test()
+  create_object_test()
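
The new create_object_test goes through the generic file_io layer, which dispatches on the path scheme, so the same calls can plausibly be sanity-checked against a local directory without a GCS bucket. A quick sketch under that assumption, with a tempfile path standing in for --gcs_bucket_url:

    import tempfile
    from tensorflow.python.lib.io import file_io

    dir_name = tempfile.mkdtemp()
    file_name = "%s/test_file.txt" % dir_name
    file_io.write_string_to_file(file_name, "test file creation.")

    # Same pattern-match assertions the GCS test makes.
    files_list = file_io.get_matching_files("%s/test_file*.txt" % dir_name)
    assert files_list == [file_name]

    file_io.delete_file(file_name)
    file_io.delete_recursively(dir_name)
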
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index db9de3f..267bcad 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -147,7 +147,7 @@
   """
   unknown_label = b"unknown"
   try:
-    val = subprocess.check_output(["git", "-C", git_base_path, "describe",
+    val = subprocess.check_output(["git", str("--git-dir="+git_base_path+"/.git"), str("--work-tree="+git_base_path), "describe",
                                    "--long", "--dirty", "--tags"]).strip()
     return val if val else unknown_label
   except subprocess.CalledProcessError:
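
The switch away from "git -C" trades brevity for compatibility: -C only appeared in git 1.8.5, while the explicit --git-dir/--work-tree pair works on older clients. The same describe call, sketched standalone (git_describe is an illustrative name, not the function in gen_git_source.py):

    import subprocess

    def git_describe(git_base_path):
      """Return `git describe` output for a checkout, or b"unknown" on failure."""
      try:
        val = subprocess.check_output(
            ["git", "--git-dir=" + git_base_path + "/.git",
             "--work-tree=" + git_base_path,
             "describe", "--long", "--dirty", "--tags"]).strip()
        return val if val else b"unknown"
      except subprocess.CalledProcessError:
        return b"unknown"
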
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 34b6a58..2539ad4 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -107,7 +107,8 @@
   mkdir -p ${TMPDIR}/third_party
   pushd ${RUNFILES%org_tensorflow}
   for header in $(find protobuf -name \*.h); do
-    cp --parents "$header" ${TMPDIR}/google;
+    mkdir -p "${TMPDIR}/google/$(dirname ${header})"
+    cp "$header" "${TMPDIR}/google/$(dirname ${header})/"
   done
   popd
   cp -R $RUNFILES/third_party/eigen3 ${TMPDIR}/third_party
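
"cp --parents" is a GNU coreutils extension, so the original one-liner breaks on BSD/macOS cp; the replacement recreates each header's directory before copying. The same copy-with-parents behavior, sketched in Python:

    import os
    import shutil

    def copy_with_parents(src, dest_root):
      """Copy src into dest_root, preserving src's relative directory structure."""
      dest_dir = os.path.join(dest_root, os.path.dirname(src))
      if not os.path.isdir(dest_dir):
        os.makedirs(dest_dir)
      shutil.copy(src, dest_dir)
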
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index e458f12..9b475e5 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -26,7 +26,7 @@
 from setuptools.command.install import install as InstallCommandBase
 from setuptools.dist import Distribution
 
-_VERSION = '0.11.0rc0'
+_VERSION = '0.11.0rc1'
 
 REQUIRED_PACKAGES = [
     'numpy >= 1.11.0',
diff --git a/tensorflow/tools/swig/.gitignore b/tensorflow/tools/swig/.gitignore
new file mode 100644
index 0000000..a14f886
--- /dev/null
+++ b/tensorflow/tools/swig/.gitignore
@@ -0,0 +1 @@
+swig_path
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index b13e6c7..4be2490 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -98,9 +98,9 @@
 
   native.http_archive(
     name = "protobuf",
-    url = "http://github.com/google/protobuf/archive/c2b3e70efd2038a54ef8973771ac58192885125e.tar.gz",
-    sha256 = "eafc1bc4c27970d62effe64ba6610823fdd66711f440d8ca4a168167786a2fcb",
-    strip_prefix = "protobuf-c2b3e70efd2038a54ef8973771ac58192885125e",
+    url = "http://github.com/google/protobuf/archive/008b5a228b37c054f46ba478ccafa5e855cb16db.tar.gz",
+    sha256 = "2737ad055eb8a9bc63ed068e32c4ea280b62d8236578cb4d4120eb5543f759ab",
+    strip_prefix = "protobuf-008b5a228b37c054f46ba478ccafa5e855cb16db",
   )
 
   native.new_http_archive(
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
index 7fafd2a..00d2e7c 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
@@ -1,3 +1,6 @@
+#ifdef _WIN32
+#define sleep(seconds) Sleep(1000*(seconds))
+#endif  // _WIN32
 #include "unsupported/Eigen/CXX11/Tensor"
 
 #ifdef _WIN32
diff --git a/util/python/python_config.sh b/util/python/python_config.sh
index 50f6398..8a780c8 100755
--- a/util/python/python_config.sh
+++ b/util/python/python_config.sh
@@ -113,29 +113,33 @@
     echo -e "\n\nERROR: Problem getting python include path.  Is distutils installed?"
     exit 1
   fi
-  local python_lib_path
-  # Split python_path into an array of paths, this allows path containing spaces
-  IFS=','
-  python_lib_path=($(python_path))
-  unset IFS
-  echo "Found possible Python library paths:"
-  for x in "${python_lib_path[@]}"; do
-    echo "  $x"
-  done
-  set -- "${python_lib_path[@]}"
-  echo "Please input the desired Python library path to use.  Default is ["$1"]"
-  read b || true
-  if [ "$b" == "" ]; then
-   python_lib="$(default_python_path "${python_lib_path[0]}")"
-   echo $python_lib
-  else
-    if test -d "$b" -a -x "$b"; then
-      python_lib="$b"
+
+  if [ -z "$PYTHON_LIB_PATH" ]; then
+    local python_lib_path
+    # Split python_path into an array of paths; this allows paths containing spaces
+    IFS=','
+    python_lib_path=($(python_path))
+    unset IFS
+    echo "Found possible Python library paths:"
+    for x in "${python_lib_path[@]}"; do
+      echo "  $x"
+    done
+    set -- "${python_lib_path[@]}"
+    echo "Please input the desired Python library path to use.  Default is ["$1"]"
+    read b || true
+    if [ "$b" == "" ]; then
+      PYTHON_LIB_PATH="$(default_python_path "${python_lib_path[0]}")"
+      echo $PYTHON_LIB_PATH
     else
-      echo -e "\n\nERROR: The path you have entered does not exist."
-      exit 1
+      PYTHON_LIB_PATH="$b"
     fi
   fi
+  if test -d "$PYTHON_LIB_PATH" -a -x "$PYTHON_LIB_PATH"; then
+    python_lib="$PYTHON_LIB_PATH"
+  else
+    echo -e "\n\nERROR: Invalid python library path: ${PYTHON_LIB_PATH}."
+    exit 1
+  fi
 
   local numpy_include=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; import numpy; print(numpy.get_include());')
   if [ "$numpy_include" == "" ]; then