From cddfef5450188b38b120167b6310339b27a363ca Mon Sep 17 00:00:00 2001 From: Matteo Bunino <48362942+matbun@users.noreply.github.com> Date: Thu, 17 Oct 2024 09:36:42 +0200 Subject: [PATCH] Refactor --- env-files/torch/create_container_env.sh | 8 +++----- ...{multistage.Dockerfile => slim.Dockerfile} | 20 +++++++++++++------ 2 files changed, 17 insertions(+), 11 deletions(-) rename env-files/torch/{multistage.Dockerfile => slim.Dockerfile} (65%) diff --git a/env-files/torch/create_container_env.sh b/env-files/torch/create_container_env.sh index f51cc4db..4e88d45d 100644 --- a/env-files/torch/create_container_env.sh +++ b/env-files/torch/create_container_env.sh @@ -8,9 +8,8 @@ if [ -z "$1" ]; then exit 2 fi -if [ "$1" == "23.09-py3" ]; then - # Tested for torch==2.1.0 - +if [[ "$1" == "23.09-py3" || "$1" == "24.09-py3" ]]; then + pip install --no-cache-dir --upgrade pip # pip install --no-cache-dir lightning torchmetrics wheel ray ray[tune] @@ -31,13 +30,12 @@ if [ "$1" == "23.09-py3" ]; then pip install --no-cache-dir deepspeed || exit 1 - - # TODO adapt to multi-stage build # # fix .triton/autotune/Fp16Matmul_2d_kernel.pickle bug # pver="$(python --version 2>&1 | awk '{print $2}' | cut -f1-2 -d.)" # line=$(cat -n /usr/lib/python${pver}/site-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py | grep os.rename | awk '{print $1}' | head -n 1) # sed -i "${line}s|^|#|" /usr/lib/python${pver}/site-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py + # Horovod # compiler vars export LDSHARED="$CC -shared" && diff --git a/env-files/torch/multistage.Dockerfile b/env-files/torch/slim.Dockerfile similarity index 65% rename from env-files/torch/multistage.Dockerfile rename to env-files/torch/slim.Dockerfile index cb3f5620..6c00a731 100644 --- a/env-files/torch/multistage.Dockerfile +++ b/env-files/torch/slim.Dockerfile @@ -1,3 +1,5 @@ +# Dockerfile for slim itwinai image. MPI, CUDA and other need to be mounted from the host machine. 
+ ARG IMG_TAG=24.09-py3 # 23.09-py3: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-09.html @@ -10,12 +12,15 @@ ARG IMG_TAG WORKDIR /app +#RUN apt-get update && apt-get install -y python3.10-venv && rm -rf /var/lib/apt/lists/* + # Virtual env -ENV VIRTUAL_ENV=/opt/venv \ - PATH="/opt/venv/bin:$PATH" +#ENV VIRTUAL_ENV=/opt/venv \ +# PATH="/opt/venv/bin:$PATH" # User python3.10 explicitly to force /opt/venv/bin/python to point to python3.10. Needed to link # /usr/local/bin/python3.10 (in the app image) to /usr/bin/python3.10 (in the builder image) -RUN python3.10 -m venv /opt/venv +#RUN python3.10 -m venv /opt/venv && \ +# pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cu124 # https://github.com/mpi4py/mpi4py/pull/431 RUN env SETUPTOOLS_USE_DISTUTILS=local python -m pip install --no-cache-dir mpi4py @@ -35,12 +40,15 @@ LABEL org.opencontainers.image.licenses=MIT LABEL maintainer="Matteo Bunino - matteo.bunino@cern.ch" # Copy virtual env -COPY --from=build /opt/venv /opt/venv +#COPY --from=build /opt/venv /opt/venv +#Need to copy /usr/local/lib/python3.10/dist-packages (6GB)? +COPY --from=build /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages +ENV PYTHONPATH="/usr/local/lib/python3.10/dist-packages:$PYTHONPATH" # Link /usr/local/bin/python3.10 (in the app image) to /usr/bin/python3.10 (in the builder image) -RUN ln -s /usr/local/bin/python3.10 /usr/bin/python3.10 +#RUN ln -s /usr/local/bin/python3.10 /usr/bin/python3.10 # Activate the virtualenv in the container # See here for more information: # https://pythonspeed.com/articles/multi-stage-docker-python/ -ENV PATH="/opt/venv/bin:$PATH" +#ENV PATH="/opt/venv/bin:$PATH"