interTwin-eu · matbun · Oct 16, 2024 · Oct 16, 2024 · Oct 17, 2024
diff --git a/env-files/tensorflow/Dockerfile b/env-files/tensorflow/Dockerfile
@@ -6,7 +6,12 @@ ARG IMG_TAG=24.08-tf2-py3
 
 FROM nvcr.io/nvidia/tensorflow:${IMG_TAG}
 
-WORKDIR /usr/src/app
+LABEL org.opencontainers.image.source=https://github.com/interTwin-eu/itwinai \
+      org.opencontainers.image.description="Base itwinai image with tensorflow dependencies and CUDA drivers" \
+      org.opencontainers.image.licenses=MIT \
+      maintainer="Matteo Bunino - [email protected]"
+
+WORKDIR /app
 
 # Install itwinai
 COPY pyproject.toml ./
@@ -15,15 +20,10 @@ COPY env-files/tensorflow/create_container_env.sh ./
 RUN bash create_container_env.sh
 
 # Create non-root user
-RUN groupadd -g 10001 jovyan \
-    && useradd -m -u 10000 -g jovyan jovyan \
-    && chown -R jovyan:jovyan /usr/src/app
-USER jovyan:jovyan
+RUN groupadd --gid 10001 jarvis \
+    && useradd --create-home --uid 10000 --gid jarvis jarvis \
+    && chown --recursive jarvis:jarvis /app
+USER jarvis:jarvis
 
 # ENTRYPOINT [ "/bin/sh" ]
 # CMD [  ]
-
-LABEL org.opencontainers.image.source=https://github.com/interTwin-eu/itwinai
-LABEL org.opencontainers.image.description="Base itwinai image with tensorflow dependencies and CUDA drivers"
-LABEL org.opencontainers.image.licenses=MIT
-LABEL maintainer="Matteo Bunino - [email protected]"
diff --git a/env-files/torch/Dockerfile b/env-files/torch/Dockerfile
@@ -5,10 +5,15 @@ ARG IMG_TAG=23.09-py3
 
 FROM nvcr.io/nvidia/pytorch:${IMG_TAG}
 
+LABEL org.opencontainers.image.source=https://github.com/interTwin-eu/itwinai \
+      org.opencontainers.image.description="Base itwinai image with torch dependencies and CUDA drivers" \
+      org.opencontainers.image.licenses=MIT \
+      maintainer="Matteo Bunino - [email protected]"
+
 # https://stackoverflow.com/a/56748289
 ARG IMG_TAG
 
-WORKDIR /usr/src/app
+WORKDIR /app
 
 # https://github.com/mpi4py/mpi4py/pull/431
 RUN env SETUPTOOLS_USE_DISTUTILS=local python -m pip install --no-cache-dir mpi4py
@@ -20,12 +25,7 @@ COPY env-files/torch/create_container_env.sh ./
 RUN bash create_container_env.sh ${IMG_TAG}
 
 # Create non-root user
-RUN groupadd -g 10001 jovyan \
-    && useradd -m -u 10000 -g jovyan jovyan \
-    && chown -R jovyan:jovyan /usr/src/app
-USER jovyan:jovyan
-
-LABEL org.opencontainers.image.source=https://github.com/interTwin-eu/itwinai
-LABEL org.opencontainers.image.description="Base itwinai image with torch dependencies and CUDA drivers"
-LABEL org.opencontainers.image.licenses=MIT
-LABEL maintainer="Matteo Bunino - [email protected]"
+RUN groupadd --gid 10001 jarvis \
+    && useradd --create-home --uid 10000 --gid jarvis jarvis \
+    && chown --recursive jarvis:jarvis /app
+USER jarvis:jarvis
diff --git a/env-files/torch/create_container_env.sh b/env-files/torch/create_container_env.sh
@@ -8,9 +8,8 @@ if [ -z "$1" ]; then
     exit 2
 fi
 
-if [ "$1" == "23.09-py3" ]; then
-    # Tested for torch==2.1.0
-
+if [[ "$1" == "23.09-py3" || "$1" == "24.09-py3" ]]; then
+
     pip install --no-cache-dir --upgrade pip
     # pip install --no-cache-dir lightning torchmetrics wheel ray ray[tune]
 
@@ -30,11 +29,12 @@ if [ "$1" == "23.09-py3" ]; then
     export DS_BUILD_TRANSFORMER_INFERENCE=1
 
     pip install --no-cache-dir deepspeed || exit 1
+
+    # # fix .triton/autotune/Fp16Matmul_2d_kernel.pickle bug
+    # pver="$(python --version 2>&1 | awk '{print $2}' | cut -f1-2 -d.)"
+    # line=$(cat -n /usr/lib/python${pver}/site-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py | grep os.rename | awk '{print $1}' | head -n 1)
+    # sed -i "${line}s|^|#|" /usr/lib/python${pver}/site-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py 
 
-    # fix .triton/autotune/Fp16Matmul_2d_kernel.pickle bug
-    pver="$(python --version 2>&1 | awk '{print $2}' | cut -f1-2 -d.)"
-    line=$(cat -n /usr/lib/python${pver}/site-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py | grep os.rename | awk '{print $1}' | head -n 1)
-    sed -i "${line}s|^|#|" /usr/lib/python${pver}/site-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py 
 
     # Horovod
     # compiler vars

diff --git a/env-files/torch/slim.Dockerfile b/env-files/torch/slim.Dockerfile
@@ -0,0 +1,54 @@
+# Dockerfile for the slim itwinai image. MPI, CUDA and other system dependencies need to be mounted from the host machine.
+
+ARG IMG_TAG=24.09-py3
+
+# 23.09-py3: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-09.html
+# 24.04-py3: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html
+
+FROM nvcr.io/nvidia/pytorch:${IMG_TAG} AS build
+
+# Re-declare IMG_TAG: an ARG set before FROM is not visible inside the stage (https://stackoverflow.com/a/56748289)
+ARG IMG_TAG
+
+WORKDIR /app
+
+#RUN apt-get update && apt-get install -y python3.10-venv && rm -rf /var/lib/apt/lists/*
+
+# Virtual env
+#ENV VIRTUAL_ENV=/opt/venv \
+#    PATH="/opt/venv/bin:$PATH"
+# Use python3.10 explicitly to force /opt/venv/bin/python to point to python3.10. Needed to link
+# /usr/local/bin/python3.10 (in the app image) to /usr/bin/python3.10 (in the builder image)
+#RUN python3.10 -m venv /opt/venv && \
+#    pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cu124
+
+# Install mpi4py with SETUPTOOLS_USE_DISTUTILS=local, see https://github.com/mpi4py/mpi4py/pull/431
+RUN SETUPTOOLS_USE_DISTUTILS=local python -m pip install --no-cache-dir mpi4py
+
+# Install itwinai through the container environment setup script
+COPY pyproject.toml ./
+COPY src ./
+COPY env-files/torch/create_container_env.sh ./
+RUN bash create_container_env.sh "${IMG_TAG}"
+
+# App image
+FROM python:3.10-slim
+
+LABEL org.opencontainers.image.source=https://github.com/interTwin-eu/itwinai
+LABEL org.opencontainers.image.description="Slim itwinai image with torch dependencies; MPI and CUDA must be mounted from the host"
+LABEL org.opencontainers.image.licenses=MIT
+LABEL maintainer="Matteo Bunino - [email protected]"
+
+# Copy virtual env
+#COPY --from=build /opt/venv /opt/venv
+# Need to copy /usr/local/lib/python3.10/dist-packages (6GB)? It is put first on PYTHONPATH below so the app image's interpreter can import from it.
+COPY --from=build /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
+ENV PYTHONPATH="/usr/local/lib/python3.10/dist-packages${PYTHONPATH:+:${PYTHONPATH}}"
+
+# Link /usr/local/bin/python3.10 (in the app image) to /usr/bin/python3.10 (in the builder image)
+#RUN ln -s /usr/local/bin/python3.10 /usr/bin/python3.10
+
+# Activate the virtualenv in the container
+# See here for more information:
+# https://pythonspeed.com/articles/multi-stage-docker-python/
+#ENV PATH="/opt/venv/bin:$PATH"