Skip to content

Commit

Permalink
Refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
matbun committed Oct 17, 2024
1 parent 5a46ba3 commit cddfef5
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 11 deletions.
8 changes: 3 additions & 5 deletions env-files/torch/create_container_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@ if [ -z "$1" ]; then
exit 2
fi

if [ "$1" == "23.09-py3" ]; then
# Tested for torch==2.1.0

if [[ "$1" == "23.09-py3" || "$1" == "24.09-py3" ]]; then

pip install --no-cache-dir --upgrade pip
# pip install --no-cache-dir lightning torchmetrics wheel ray ray[tune]

Expand All @@ -31,13 +30,12 @@ if [ "$1" == "23.09-py3" ]; then

pip install --no-cache-dir deepspeed || exit 1


# TODO adapt to multi-stage build
# # fix .triton/autotune/Fp16Matmul_2d_kernel.pickle bug
# pver="$(python --version 2>&1 | awk '{print $2}' | cut -f1-2 -d.)"
# line=$(cat -n /usr/lib/python${pver}/site-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py | grep os.rename | awk '{print $1}' | head -n 1)
# sed -i "${line}s|^|#|" /usr/lib/python${pver}/site-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py


# Horovod
# compiler vars
export LDSHARED="$CC -shared" &&
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Dockerfile for the slim itwinai image. MPI, CUDA, and other system libraries need to be mounted from the host machine.

ARG IMG_TAG=24.09-py3

# 23.09-py3: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-09.html
Expand All @@ -10,12 +12,15 @@ ARG IMG_TAG

WORKDIR /app

#RUN apt-get update && apt-get install -y python3.10-venv && rm -rf /var/lib/apt/lists/*

# Virtual env
ENV VIRTUAL_ENV=/opt/venv \
PATH="/opt/venv/bin:$PATH"
#ENV VIRTUAL_ENV=/opt/venv \
# PATH="/opt/venv/bin:$PATH"
# Use python3.10 explicitly to force /opt/venv/bin/python to point to python3.10. Needed to link
# /usr/local/bin/python3.10 (in the app image) to /usr/bin/python3.10 (in the builder image)
RUN python3.10 -m venv /opt/venv
#RUN python3.10 -m venv /opt/venv && \
# pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cu124

# https://github.com/mpi4py/mpi4py/pull/431
RUN env SETUPTOOLS_USE_DISTUTILS=local python -m pip install --no-cache-dir mpi4py
Expand All @@ -35,12 +40,15 @@ LABEL org.opencontainers.image.licenses=MIT
LABEL maintainer="Matteo Bunino - [email protected]"

# Copy virtual env
COPY --from=build /opt/venv /opt/venv
#COPY --from=build /opt/venv /opt/venv
#Need to copy /usr/local/lib/python3.10/dist-packages (6GB)?
COPY --from=build /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
# Put the copied dist-packages on Python's module search path. Any PYTHONPATH
# already defined by the base image is appended; the ":+" expansion adds the
# separator only when a previous value exists, so no empty entry is created.
ENV PYTHONPATH="/usr/local/lib/python3.10/dist-packages${PYTHONPATH:+:${PYTHONPATH}}"

# Link /usr/local/bin/python3.10 (in the app image) to /usr/bin/python3.10 (in the builder image)
RUN ln -s /usr/local/bin/python3.10 /usr/bin/python3.10
#RUN ln -s /usr/local/bin/python3.10 /usr/bin/python3.10

# Activate the virtualenv in the container
# See here for more information:
# https://pythonspeed.com/articles/multi-stage-docker-python/
ENV PATH="/opt/venv/bin:$PATH"
#ENV PATH="/opt/venv/bin:$PATH"

0 comments on commit cddfef5

Please sign in to comment.