[ROCm] Updating Dockerfile.rocm / CI scripts to use ROCm 3.3
diff --git a/configure.py b/configure.py
index 4dd3469..f051fab 100644
--- a/configure.py
+++ b/configure.py
@@ -1171,14 +1171,16 @@
test_only_filters = ['-oss_serial']
if is_windows():
test_and_build_filters.append('-no_windows')
- if environ_cp.get('TF_NEED_CUDA', None) == '1':
+ if ((environ_cp.get('TF_NEED_CUDA', None) == '1') or
+ (environ_cp.get('TF_NEED_ROCM', None) == '1')):
test_and_build_filters += ['-no_windows_gpu', '-no_gpu']
else:
test_and_build_filters.append('-gpu')
elif is_macos():
test_and_build_filters += ['-gpu', '-nomac', '-no_mac']
elif is_linux():
- if environ_cp.get('TF_NEED_CUDA', None) == '1':
+ if ((environ_cp.get('TF_NEED_CUDA', None) == '1') or
+ (environ_cp.get('TF_NEED_ROCM', None) == '1')):
test_and_build_filters.append('-no_gpu')
write_to_bazelrc('test --test_env=LD_LIBRARY_PATH')
else:
@@ -1416,6 +1418,10 @@
write_action_env_to_bazelrc('LD_LIBRARY_PATH',
environ_cp.get('LD_LIBRARY_PATH'))
+ if (environ_cp.get('TF_NEED_ROCM') == '1' and environ_cp.get('ROCM_PATH')):
+ write_action_env_to_bazelrc('ROCM_PATH',environ_cp.get('ROCM_PATH'))
+ write_action_env_to_bazelrc('ROCM_ROOT',environ_cp.get('ROCM_PATH'))
+
environ_cp['TF_NEED_CUDA'] = str(
int(get_var(environ_cp, 'TF_NEED_CUDA', 'CUDA', False)))
if (environ_cp.get('TF_NEED_CUDA') == '1' and
diff --git a/tensorflow/tools/ci_build/Dockerfile.rocm b/tensorflow/tools/ci_build/Dockerfile.rocm
index c1928c8..6d12420 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rocm
+++ b/tensorflow/tools/ci_build/Dockerfile.rocm
@@ -3,8 +3,8 @@
FROM ubuntu:xenial
MAINTAINER Jeff Poznanovic <jeffrey.poznanovic@amd.com>
-ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/3.0/
-ARG ROCM_PATH=/opt/rocm
+ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/3.3/
+ARG ROCM_PATH=/opt/rocm-3.3.0
ENV DEBIAN_FRONTEND noninteractive
ENV TF_NEED_ROCM 1
@@ -71,7 +71,21 @@
ENV PATH="$OPENCL_ROOT/bin:${PATH}"
# Add target file to help determine which device(s) to build for
-RUN bash -c 'echo -e "gfx803\ngfx900\ngfx906" >> /opt/rocm/bin/target.lst'
+RUN bash -c 'echo -e "gfx803\ngfx900\ngfx906" >> ${ROCM_PATH}/bin/target.lst'
+
+# Need to explicitly create the $ROCM_PATH/.info/version file to work around what seems to be a bazel bug
+# The env vars being set via --action_env in .bazelrc and .tf_configure.bazelrc files are sometimes
+# not getting set in the build command being spawned by bazel (in theory this should not happen)
+# As a consequence ROCM_PATH is sometimes not set for the hipcc commands.
+# When hipcc invokes hcc, it specifies $ROCM_PATH/.../include dirs via the `-isystem` options
+# If ROCM_PATH is not set, it defaults to /opt/rocm, and as a consequence a dependency is generated on the
+# header files included within `/opt/rocm`, which then leads to bazel dependency errors
+# Explicitly creating the $ROCM_PATH/.info/version allows ROCM path to be set correctly, even when ROCM_PATH
+# is not explicitly set, and thus avoids the eventual bazel dependency error.
+# The bazel bug needs to be root-caused and addressed, but that is out of our control and may take a long time
+# to come to fruition, so implementing the workaround to make do till then
+# Filed https://github.com/bazelbuild/bazel/issues/11163 for tracking this
+RUN touch ${ROCM_PATH}/.info/version
# Copy and run the install scripts.
COPY install/*.sh /install/
@@ -90,3 +104,7 @@
# Configure the build for our ROCm configuration.
ENV TF_NEED_ROCM 1
+# This is a temporary workaround to fix Out-Of-Memory errors we are running into with XLA perf tests
+# By default, HIP runtime "hides" 256MB from the TF Runtime, but with recent changes (update to ROCm2.3, dynamic loading of roc* libs, et al)
+# it seems that we need to up the threshold slightly to 320MB
+ENV HIP_HIDDEN_FREE_MEM=320
diff --git a/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh b/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh
index 08d99f4..f1912c5 100755
--- a/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh
+++ b/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh
@@ -30,6 +30,7 @@
export CC_OPT_FLAGS='-mavx'
export TF_NEED_ROCM=1
+export ROCM_PATH=/opt/rocm-3.3.0
export TF_GPU_COUNT=${N_GPUS}
yes "" | $PYTHON_BIN_PATH configure.py
@@ -42,7 +43,7 @@
--test_lang_filters=cc \
--jobs=${N_JOBS} \
--local_test_jobs=${TF_GPU_COUNT}\
- --test_timeout 300,450,1200,3600 \
+ --test_timeout 600,900,2400,7200 \
--build_tests_only \
--test_output=errors \
--test_sharding_strategy=disabled \
diff --git a/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh b/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh
index 61813df..df69044 100755
--- a/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh
+++ b/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh
@@ -30,6 +30,7 @@
export CC_OPT_FLAGS='-mavx'
export TF_NEED_ROCM=1
+export ROCM_PATH=/opt/rocm-3.3.0
export TF_GPU_COUNT=${N_GPUS}
yes "" | $PYTHON_BIN_PATH configure.py
@@ -38,12 +39,13 @@
bazel test \
--config=rocm \
-k \
- --test_tag_filters=gpu,-no_gpu,-no_rocm,-benchmark-test,-no_oss,-oss_serial,-rocm_multi_gpu, \
- --test_timeout 600,900,2400,7200 \
- --test_output=errors \
+ --test_tag_filters=gpu,-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
--jobs=${N_JOBS} \
--local_test_jobs=${TF_GPU_COUNT} \
+ --test_timeout 600,900,2400,7200 \
+ --test_output=errors \
--test_sharding_strategy=disabled \
+ --test_size_filters=small,medium \
--run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \
-- \
//tensorflow/... \
diff --git a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
index 64bfffa..b255789 100755
--- a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
@@ -30,6 +30,7 @@
export CC_OPT_FLAGS='-mavx'
export TF_NEED_ROCM=1
+export ROCM_PATH=/opt/rocm-3.3.0
export TF_GPU_COUNT=${N_GPUS}
yes "" | $PYTHON_BIN_PATH configure.py
@@ -46,6 +47,7 @@
--build_tests_only \
--test_output=errors \
--test_sharding_strategy=disabled \
+ --test_size_filters=small,medium \
--run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \
-- \
//tensorflow/... \
diff --git a/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh
index 9288b7b35..6ce1fad 100755
--- a/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh
+++ b/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh
@@ -30,6 +30,7 @@
export CC_OPT_FLAGS='-mavx'
export TF_NEED_ROCM=1
+export ROCM_PATH=/opt/rocm-3.3.0
export TF_GPU_COUNT=${N_GPUS}
yes "" | $PYTHON_BIN_PATH configure.py
@@ -47,6 +48,7 @@
--build_tests_only \
--test_output=errors \
--test_sharding_strategy=disabled \
+ --test_size_filters=small,medium \
--run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \
-- \
//tensorflow/compiler/... \