
Commit 84df82d

Upgrade to Tensorflow 2.6 (#182)
http://b/177304566
1 parent 45fffc8 commit 84df82d

4 files changed: +25 -28 lines


Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ RUN R -e 'reticulate::install_miniconda()'
 ENV RETICULATE_PYTHON=/root/.local/share/r-miniconda/envs/r-reticulate/bin/python

 # Tensorflow and Keras
-RUN R -e 'keras::install_keras(tensorflow = "2.3", extra_packages = c("pandas", "numpy", "pycryptodome"), method="conda")'
+RUN R -e 'keras::install_keras(tensorflow = "2.6", extra_packages = c("pandas", "numpy", "pycryptodome"), method="conda")'

 # Install kaggle libraries.
 # Do this at the end to avoid rebuilding everything when any change is made.
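
A quick sanity check of the new CPU pin, offered as a hedged sketch rather than part of this commit (it assumes the r-reticulate conda env configured above is the interpreter reticulate picks up):

# Confirm reticulate points at the conda env and TensorFlow reports a 2.6.x build.
library(tensorflow)
reticulate::py_config()
tensorflow::tf_version()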

gpu.Dockerfile

Lines changed: 17 additions & 23 deletions
@@ -1,22 +1,20 @@
 ARG BASE_TAG=staging
-FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 AS nvidia
+FROM nvidia/cuda:11.4.2-cudnn8-devel-ubuntu18.04 AS nvidia
 FROM gcr.io/kaggle-images/rstats:${BASE_TAG}
 ARG ncpus=1

 ADD clean-layer.sh /tmp/clean-layer.sh

 # Cuda support
 COPY --from=nvidia /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/
-COPY --from=nvidia /etc/apt/sources.list.d/nvidia-ml.list /etc/apt/sources.list.d/
 COPY --from=nvidia /etc/apt/trusted.gpg /etc/apt/trusted.gpg.d/cuda.gpg

-ENV CUDA_MAJOR_VERSION=10
-ENV CUDA_MINOR_VERSION=2
-ENV CUDA_PATCH_VERSION=89
+ENV CUDA_MAJOR_VERSION=11
+ENV CUDA_MINOR_VERSION=4
+ENV CUDA_PATCH_VERSION=2
 ENV CUDA_VERSION=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.$CUDA_PATCH_VERSION
-ENV CUDA_PKG_VERSION=$CUDA_MAJOR_VERSION-$CUDA_MINOR_VERSION=$CUDA_VERSION-1
-ENV CUDNN_VERSION=7.6.5.32
-ENV CUBLAS_VERSION=10.2.2.89
+ENV CUDA_PKG_VERSION=$CUDA_MAJOR_VERSION-$CUDA_MINOR_VERSION
+ENV CUDNN_VERSION=8.2.4.15
 LABEL com.nvidia.volumes.needed="nvidia_driver"
 LABEL com.nvidia.cuda.version="${CUDA_VERSION}"
 LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}"
@@ -39,17 +37,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     cuda-nvml-dev-$CUDA_PKG_VERSION \
     cuda-minimal-build-$CUDA_PKG_VERSION \
     cuda-command-line-tools-$CUDA_PKG_VERSION \
-    libcudnn7=$CUDNN_VERSION-1+cuda$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION \
-    libcudnn7-dev=$CUDNN_VERSION-1+cuda$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION \
-    libcublas10=$CUBLAS_VERSION-1 \
-    libcublas-dev=$CUBLAS_VERSION-1 \
-    libnccl2=2.5.6-1+cuda$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION \
-    libnccl-dev=2.5.6-1+cuda$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \
-    ln -s /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION /usr/local/cuda && \
-    ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
-    # TODO: remove this hack when we move past tensorflow 2.3
-    # https://github.com/tensorflow/tensorflow/issues/38578#issuecomment-760175854
-    ln -sf /usr/local/cuda/lib64/libcudart.so.10.2 /usr/local/cuda/lib64/libcudart.so.10.1 && \
+    libcudnn8=$CUDNN_VERSION-1+cuda$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION \
+    libcudnn8-dev=$CUDNN_VERSION-1+cuda$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION \
+    libcublas-$CUDA_PKG_VERSION \
+    libcublas-dev-$CUDA_PKG_VERSION \
+    libnccl2=2.11.4-1+cuda$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION \
+    libnccl-dev=2.11.4-1+cuda$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \
     /tmp/clean-layer.sh

 ENV CUDA_HOME=/usr/local/cuda
@@ -62,7 +55,7 @@ ENV CUDA_HOME=/usr/local/cuda
 ADD ldpaths $R_HOME/etc/ldpaths

 # Install tensorflow with GPU support
-RUN R -e 'keras::install_keras(tensorflow = "2.3-gpu")' && \
+RUN R -e 'keras::install_keras(tensorflow = "2.6-gpu")' && \
     rm -rf /tmp/tensorflow_gpu && \
     /tmp/clean-layer.sh

@@ -77,8 +70,9 @@ RUN CPATH=/usr/local/cuda/targets/x86_64-linux/include install2.r --error --ncpu

 # Torch: install the full package upfront otherwise it will be installed on loading the package which doesn't work for kernels
 # without internet (competitions for example). It will detect CUDA and install the proper version.
-# TODO(b/224540778) Unpin Torch.
-RUN R -e 'library(devtools); install_version("torch", version = "0.6.0", ask=FALSE)'
-RUN R -e 'library(torch); install_torch(reinstall = TRUE)'
+# Make Torch think we use CUDA 11.3 (https://github.com/mlverse/torch/issues/807)
+ENV CUDA=11.3
+RUN R -e 'install.packages("torch")'
+RUN R -e 'library(torch); install_torch()'

 CMD ["R"]
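
For the GPU image, a similar hedged sketch (not in the commit) can confirm that the runtime picks up CUDA; it assumes the container is started with GPU access (e.g. docker run --gpus all):

library(tensorflow)
# Should list at least one GPU device for the 2.6-gpu build.
tf$config$list_physical_devices("GPU")

library(torch)
# TRUE when install_torch() baked the CUDA 11.3 binaries into the image at build time.
cuda_is_available()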

package_installs.R

Lines changed: 3 additions & 4 deletions
@@ -52,10 +52,9 @@ install.packages("imager")

 # Torch: install the full package upfront otherwise it will be installed on loading the package which doesn't work for kernels
 # without internet (competitions for example).
-# TODO(b/224540778) Unpin Torch.
-install_version("torch", version = "0.6.0", ask=FALSE)
+install.packages("torch")
 library(torch)
-install_torch(reinstall = TRUE)
+install_torch()

 # The R Keras package must be reinstalled after installing it in the python virtualenv.
-install_version("keras", version = "2.3.0.0", ask=FALSE)
+install_version("keras", version = "2.6.0.0", ask=FALSE)
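
With the torch pin dropped and keras bumped, a hedged check of the installed R packages (not part of this commit) could be:

# keras should now be in the 2.6 series to match the conda-installed TensorFlow;
# torch floats to whatever CRAN release was current at build time.
packageVersion("keras")
packageVersion("torch")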

test

Lines changed: 4 additions & 0 deletions
@@ -87,8 +87,12 @@ docker kill jupyter_test_r && docker rm jupyter_test_r
 docker run --rm --name=papermill_test_r --read-only --net=none \
     "$IMAGE_TAG" python -c 'import sys;import papermill as pm; print(pm.__version__)'

+
+# TF_FORCE_GPU_ALLOW_GROWTH is to prevent tensorflow from allocating the totality of a GPU memory.
+# https://stackoverflow.com/questions/34199233/how-to-prevent-tensorflow-from-allocating-the-totality-of-a-gpu-memory/55541385#55541385
 docker run --rm -t --net=none \
     -e HOME=/tmp \
+    -e TF_FORCE_GPU_ALLOW_GROWTH=true \
     -v $PWD:/input:ro -v /tmp/rstats-build/working:/working \
     -v /tmp/rstats-build/tmp:/tmp -v /tmp/rstats-build/devshm:/dev/shm \
     -w=/working \
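
TF_FORCE_GPU_ALLOW_GROWTH only takes effect for processes that read it at startup; as a hedged alternative sketch (not part of this commit), the same behaviour can be requested from R through the tensorflow bindings:

library(tensorflow)
# Ask TensorFlow to grow GPU memory on demand instead of reserving all of it up front.
gpus <- tf$config$list_physical_devices("GPU")
for (gpu in gpus) tf$config$experimental$set_memory_growth(gpu, TRUE)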
