diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh deleted file mode 100755 index 93c106c6e6..0000000000 --- a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# This will stop execution when any command will have non-zero status. -set -e - -VM_NAME="pytorch-dino-7d" -ZONE_NAME="us-west1-b" -ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/dino" -TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/dino/setup_host_and_run_model.sh" - -cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/" - -source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH - diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/continuous.cfg b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/continuous.cfg deleted file mode 100644 index 3dc0813d14..0000000000 --- a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/continuous.cfg +++ /dev/null @@ -1,4 +0,0 @@ -build_file: "gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh" - -# 2 hours timeout. -timeout_mins: 60 diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh new file mode 100755 index 0000000000..3f74686208 --- /dev/null +++ b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Stop execution as soon as any command exits with a non-zero status. +set -e + +VM_NAME="pytorch-dino-7d" +ZONE_NAME="us-west1-b" +ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v1_12/dino" +TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh" +PYTORCH_VERSION="v1_12" + +cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/" + +source run_and_manage_test.sh "$VM_NAME" "$ZONE_NAME" "$ARTIFACTS_BUCKET_PATH" "$TEST_SCRIPT_PATH" "$PYTORCH_VERSION" diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/continuous.cfg b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/continuous.cfg new file mode 100644 index 0000000000..184cdb9442 --- /dev/null +++ b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/continuous.cfg @@ -0,0 +1,18 @@ +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +build_file: "gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh" + +# 60 minutes (1 hour) timeout. +timeout_mins: 60 diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh new file mode 100755 index 0000000000..9f1b11a953 --- /dev/null +++ b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Stop execution as soon as any command exits with a non-zero status. +set -e + +VM_NAME="pytorch2-dino-7d" +ZONE_NAME="us-west1-a" +ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v2/dino" +TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh" +PYTORCH_VERSION="v2" + +cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/" + +source run_and_manage_test.sh "$VM_NAME" "$ZONE_NAME" "$ARTIFACTS_BUCKET_PATH" "$TEST_SCRIPT_PATH" "$PYTORCH_VERSION" diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/continuous.cfg b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/continuous.cfg new file mode 100644 index 0000000000..8d3d851ddb --- /dev/null +++ b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/continuous.cfg @@ -0,0 +1,18 @@ +# Copyright 2023 Google Inc.
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +build_file: "gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh" + +# 60 minutes (1 hour) timeout. +timeout_mins: 60 diff --git a/perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh b/perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh index 0e7e46c906..3c0b9b7be0 100755 --- a/perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh +++ b/perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh @@ -27,6 +27,11 @@ ZONE_NAME=$2 ARTIFACTS_BUCKET_PATH=$3 # Path of test script relative to $HOME inside test VM. TEST_SCRIPT_PATH=$4 +# PyTorch version under test (e.g. v1_12 or v2); "v2" switches the VM to the L4 machine shape. +PYTORCH_VERSION=$5 +MACHINE_TYPE="a2-highgpu-2g" +ACCELERATOR="count=2,type=nvidia-tesla-a100" +RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-2gpus" function initialize_ssh_key () { echo "Delete existing ssh keys " @@ -55,26 +60,35 @@ function delete_existing_vm_and_create_new () { echo "Wait for 30 seconds for old VM to be deleted" sleep 30s + # NVIDIA A100 40GB GPU type machine is currently unavailable due to global shortage. + # Create NVIDIA L4 machines instead, which are available in the us-west1-a zone.
+ if [[ "${PYTORCH_VERSION}" == "v2" ]]; + then + MACHINE_TYPE="g2-standard-24" + ACCELERATOR="count=2,type=nvidia-l4" + RESERVATION="projects/$GCP_PROJECT/reservations/pytorch2-ai-ml-tests" + fi + echo "Creating VM $VM_NAME in zone $ZONE_NAME" # The below command creates VM using the reservation 'ai-ml-tests' sudo gcloud compute instances create $VM_NAME \ - --project=$GCP_PROJECT\ - --zone=$ZONE_NAME \ - --machine-type=a2-highgpu-2g \ - --network-interface=network-tier=PREMIUM,nic-type=GVNIC,stack-type=IPV4_ONLY,subnet=default \ - --metadata=enable-osconfig=TRUE,enable-oslogin=true \ - --maintenance-policy=TERMINATE \ - --provisioning-model=STANDARD \ - --service-account=927584127901-compute@developer.gserviceaccount.com \ - --scopes=https://www.googleapis.com/auth/cloud-platform \ - --accelerator=count=2,type=nvidia-tesla-a100 \ - --create-disk=auto-delete=yes,boot=yes,device-name=$VM_NAME,image=projects/ubuntu-os-cloud/global/images/ubuntu-2004-focal-v20230616,mode=rw,size=150,type=projects/$GCP_PROJECT/zones/$ZONE_NAME/diskTypes/pd-balanced \ - --no-shielded-secure-boot \ - --shielded-vtpm \ - --shielded-integrity-monitoring \ - --labels=goog-ops-agent-policy=v2-x86-template-1-0-0,goog-ec-src=vm_add-gcloud \ - --reservation-affinity=specific \ - --reservation=projects/$GCP_PROJECT/reservations/ai-ml-tests-2gpus + --project=$GCP_PROJECT \ + --zone=$ZONE_NAME \ + --machine-type=$MACHINE_TYPE \ + --network-interface=network-tier=PREMIUM,nic-type=GVNIC,stack-type=IPV4_ONLY,subnet=default \ + --metadata=enable-osconfig=TRUE,enable-oslogin=true \ + --maintenance-policy=TERMINATE \ + --provisioning-model=STANDARD \ + --service-account=927584127901-compute@developer.gserviceaccount.com \ + --scopes=https://www.googleapis.com/auth/cloud-platform \ + --accelerator=$ACCELERATOR \ +
--create-disk=auto-delete=yes,boot=yes,device-name=$VM_NAME,image=projects/ubuntu-os-cloud/global/images/ubuntu-2004-focal-v20230616,mode=rw,size=150,type=projects/$GCP_PROJECT/zones/$ZONE_NAME/diskTypes/pd-balanced \ + --no-shielded-secure-boot \ + --shielded-vtpm \ + --shielded-integrity-monitoring \ + --labels=goog-ops-agent-policy=v2-x86-template-1-0-0,goog-ec-src=vm_add-gcloud \ + --reservation-affinity=specific \ + --reservation=$RESERVATION echo "Wait for 30 seconds for new VM to be initialised" sleep 30s @@ -132,7 +146,7 @@ exit_status=0 # Transitions: # START to START: If model run is not triggerred due to some error. # START to RUNNING: If model is successfully triggerred on GPU. This state is -# changed by setup_host.sh that runs inside docker container of test VM. +# changed by setup_host.sh that runs inside docker container of test VM. if [ $current_status == "START" ]; then echo "Update commit Id for the run" diff --git a/perfmetrics/scripts/ml_tests/pytorch/dino/README-usage.md b/perfmetrics/scripts/ml_tests/pytorch/README-usage.md similarity index 93% rename from perfmetrics/scripts/ml_tests/pytorch/dino/README-usage.md rename to perfmetrics/scripts/ml_tests/pytorch/README-usage.md index 1364c7097f..014eef20c2 100644 --- a/perfmetrics/scripts/ml_tests/pytorch/dino/README-usage.md +++ b/perfmetrics/scripts/ml_tests/pytorch/README-usage.md @@ -16,7 +16,7 @@ curl, ca-certificates, lsb-release etc. This script contains the instruction to install gcsfuse, mount GCS-bucket using gcsfuse, and finally runs the pytorch dino model. -### File: perfmetrics/scripts/continuous_test/pytorch/dino/build.sh +### File: perfmetrics/scripts/continuous_test/ml_tests/pytorch/{v1_12 or v2}/dino/build.sh This is the parent script of the above two scripts. Firstly, it sets-up the host machine after that it creates the docker-image and finally it runs the container with the inststructions written in the setup_container.sh.
@@ -40,6 +40,6 @@ log.txt - Contains the model learning parameter value after each epoch. variable - with current working directory. 3. Create a folder named "github" and clone the gcsfuse repo in that. 4. Run the below script in the current working directory: - **source github/gcsfuse/permetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh** + **source github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/{v1_12 or v2}/dino/build.sh** 5. The above command first setups the host and then start running the model inside container. diff --git a/perfmetrics/scripts/ml_tests/pytorch/dino/setup_host_and_run_model.sh b/perfmetrics/scripts/ml_tests/pytorch/dino/setup_host_and_run_model.sh deleted file mode 100755 index 3c710f2e14..0000000000 --- a/perfmetrics/scripts/ml_tests/pytorch/dino/setup_host_and_run_model.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# This will stop execution when any command will have non-zero status. -set -e - -cd "$HOME/github/gcsfuse/perfmetrics/scripts" - -echo "Setting up the machine with Docker and Nvidia Driver" -source ml_tests/setup_host.sh - -cd "$HOME/github/gcsfuse" -echo "Building docker image containing all pytorch libraries..." -sudo docker build . -f perfmetrics/scripts/ml_tests/pytorch/dino/Dockerfile --tag pytorch-gcsfuse - -mkdir -p container_artifacts - -echo "Running the docker image build in the previous step..." -sudo docker run --gpus all --name=pytorch_automation_container --privileged -d -v $HOME/github/gcsfuse/container_artifacts:/pytorch_dino/run_artifacts:rw,rshared \ ---shm-size=128g pytorch-gcsfuse:latest - -# Setup the log_rotation. -source perfmetrics/scripts/ml_tests/setup_log_rotation.sh $HOME/github/gcsfuse/container_artifacts/gcsfuse.log - -# Wait for the script completion as well as logs output.
-sudo docker logs -f pytorch_automation_container diff --git a/perfmetrics/scripts/ml_tests/pytorch/run_container.sh b/perfmetrics/scripts/ml_tests/pytorch/run_container.sh new file mode 100644 index 0000000000..f0f787a9c9 --- /dev/null +++ b/perfmetrics/scripts/ml_tests/pytorch/run_container.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +# PyTorch version (e.g. v1_12, v2); selects perfmetrics/scripts/ml_tests/pytorch/<version>/dino/Dockerfile. +PYTORCH_VERSION=$1 +cd "$HOME/github/gcsfuse" +echo "Building docker image containing all pytorch libraries..." +sudo docker build . -f "perfmetrics/scripts/ml_tests/pytorch/${PYTORCH_VERSION}/dino/Dockerfile" --tag pytorch-gcsfuse + +mkdir -p container_artifacts + +echo "Running the docker image build in the previous step..." +sudo docker run --gpus all --name=pytorch_automation_container --privileged -d -v "$HOME/github/gcsfuse/container_artifacts:/pytorch_dino/run_artifacts:rw,rshared" \ +--shm-size=128g pytorch-gcsfuse:latest + +# Setup the log_rotation. +source perfmetrics/scripts/ml_tests/setup_log_rotation.sh "$HOME/github/gcsfuse/container_artifacts/gcsfuse.log" + +# Wait for the script completion as well as logs output.
+sudo docker logs -f pytorch_automation_container diff --git a/perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh b/perfmetrics/scripts/ml_tests/pytorch/run_model.sh similarity index 80% rename from perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh rename to perfmetrics/scripts/ml_tests/pytorch/run_model.sh index 1952c1515d..674878bd5e 100755 --- a/perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh +++ b/perfmetrics/scripts/ml_tests/pytorch/run_model.sh @@ -1,4 +1,19 @@ #!/bin/bash +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +PYTORCH_VERSION=$1 # Install golang wget -O go_tar.tar.gz https://go.dev/dl/go1.21.3.linux-amd64.tar.gz -q @@ -39,7 +54,8 @@ def pil_loader(path: str) -> Image.Image: return rgb_img " > bypassed_code.py -folder_file="/opt/conda/lib/python3.7/site-packages/torchvision/datasets/folder.py" +# Ask Python where torchvision's folder.py lives; the site-packages path differs per image (python3.7 vs python3.10). +folder_file="$(python -c 'import torchvision.datasets.folder as m; print(m.__file__)')" x=$(grep -n "def pil_loader(path: str) -> Image.Image:" $folder_file | cut -f1 -d ':') y=$(grep -n "def accimage_loader(path: str) -> Any:" $folder_file | cut -f1 -d ':') y=$((y - 2)) @@ -51,7 +67,7 @@ sed -i "$x"'r bypassed_code.py' $folder_file # nproc_per_node - by downloading the model in single thread environment.
python -c 'import torch;torch.hub.list("facebookresearch/xcit:main")' -ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/dino" +ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/${PYTORCH_VERSION}/dino" echo "Update status file" echo "RUNNING" > status.txt gsutil cp status.txt $ARTIFACTS_BUCKET_PATH/ @@ -66,7 +82,7 @@ gsutil cp start_time.txt $ARTIFACTS_BUCKET_PATH/ # We need to run it in foreground mode to make the container running. echo "Running the pytorch dino model..." experiment=dino_experiment - python3 -m torch.distributed.launch \ + torchrun \ --nproc_per_node=2 dino/main_dino.py \ --arch vit_small \ --num_workers 20 \ diff --git a/perfmetrics/scripts/ml_tests/pytorch/dino/Dockerfile b/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/Dockerfile similarity index 86% rename from perfmetrics/scripts/ml_tests/pytorch/dino/Dockerfile rename to perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/Dockerfile index 8f0a34ed85..d96e780324 100644 --- a/perfmetrics/scripts/ml_tests/pytorch/dino/Dockerfile +++ b/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/Dockerfile @@ -24,9 +24,11 @@ WORKDIR "/pytorch_dino/" RUN git clone "https://github.com/facebookresearch/dino" -COPY perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh ./ +COPY perfmetrics/scripts/ml_tests/pytorch/run_model.sh ./ RUN mkdir -p "run_artifacts" RUN mkdir -p "gcsfuse_data" -ENTRYPOINT ["/bin/bash", "-c", "./setup_container.sh"] +ENV PYTORCH_VERSION="v1_12" + +ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION}"] diff --git a/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh b/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh new file mode 100755 index 0000000000..c4a4580b64 --- /dev/null +++ b/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright 2023 Google Inc. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Stop execution as soon as any command exits with a non-zero status. +set -e + +cd "$HOME/github/gcsfuse/perfmetrics/scripts" + +echo "Setting up the machine with Docker and Nvidia Driver" +# Driver version for A100 GPUs is 450.172.01 +DRIVER_VERSION="450.172.01" +source ml_tests/setup_host.sh "$DRIVER_VERSION" + +PYTORCH_VERSION="v1_12" +source ml_tests/pytorch/run_container.sh "$PYTORCH_VERSION" diff --git a/perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile b/perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile new file mode 100644 index 0000000000..e06a713020 --- /dev/null +++ b/perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile @@ -0,0 +1,46 @@ +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +# Image with gcsfuse installed and its package (.deb) +FROM gcr.io/deeplearning-platform-release/pytorch-gpu.2-0.py310 + +# Allow non-root users to specify the allow_other or allow_root mount options +RUN echo "user_allow_other" > /etc/fuse.conf + +RUN pip3 install timm + +WORKDIR "/pytorch_dino/" + +RUN git clone "https://github.com/facebookresearch/dino" +# TODO(TulsiShah): The current docker image does not support running the dino model in compile mode. +# Re-enable the commented-out steps below once the docker image supports it. + +# WORKDIR "/pytorch_dino/dino" +# RUN echo '[remote "origin"]' >> .git/config +# RUN echo ' fetch = +refs/pull/262/head:refs/remotes/origin/pr/262' >> .git/config +# +# RUN git fetch origin +# RUN git diff origin/main origin/pr/262 > diff.patch +# RUN git apply diff.patch +# +# WORKDIR "/pytorch_dino/" + +COPY perfmetrics/scripts/ml_tests/pytorch/run_model.sh ./ + +RUN mkdir -p "run_artifacts" +RUN mkdir -p "gcsfuse_data" + +ENV PYTORCH_VERSION="v2" + +ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION}"] diff --git a/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh b/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh new file mode 100755 index 0000000000..f2edd6f886 --- /dev/null +++ b/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright 2023 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Stop execution as soon as any command exits with a non-zero status. +set -e + +cd "$HOME/github/gcsfuse/perfmetrics/scripts" + +echo "Setting up the machine with Docker and Nvidia Driver" +# Driver version for L4 GPUs is 525.60.13 +DRIVER_VERSION="525.60.13" +source ml_tests/setup_host.sh "$DRIVER_VERSION" + +PYTORCH_VERSION="v2" +source ml_tests/pytorch/run_container.sh "$PYTORCH_VERSION" diff --git a/perfmetrics/scripts/ml_tests/setup_host.sh b/perfmetrics/scripts/ml_tests/setup_host.sh index 464da67666..e89c77e5c2 100755 --- a/perfmetrics/scripts/ml_tests/setup_host.sh +++ b/perfmetrics/scripts/ml_tests/setup_host.sh @@ -2,6 +2,9 @@ # This file installs docker engine and nvidia driver and nvidia container tool # necessary for running dlc container on the vm +# NVIDIA driver version to install (first argument); defaults to 450.172.01, the value this script previously hard-coded. +DRIVER_VERSION=${1:-450.172.01} + # Install Ops-agent to get the memory and processes' related data on VM console. curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh @@ -32,7 +35,6 @@ sudo apt-get install docker-ce docker-ce-cli containerd.io docker-compose-plugin echo "Installing driver..." sudo apt update && sudo apt install -y build-essential BASE_URL=https://us.download.nvidia.com/tesla -DRIVER_VERSION=450.172.01 sudo curl -fSsl -O $BASE_URL/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run sudo sh NVIDIA-Linux-x86_64-$DRIVER_VERSION.run -s