From 53420fa5cc73943bae154a955eba47917c2820fd Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Wed, 6 Nov 2024 15:10:15 +0100 Subject: [PATCH] Update all demo scripts for use on GKE with a k8s 1.31 alpha cluster Signed-off-by: Kevin Klues --- demo/clusters/gke/create-cluster.sh | 16 ++++++++++------ demo/clusters/gke/install-dra-driver.sh | 3 ++- demo/specs/quickstart/gpu-test-mps.yaml | 4 ++++ demo/specs/quickstart/gpu-test1.yaml | 8 ++++++++ demo/specs/quickstart/gpu-test2.yaml | 4 ++++ demo/specs/quickstart/gpu-test3.yaml | 8 ++++++++ demo/specs/quickstart/gpu-test4.yaml | 4 ++++ demo/specs/quickstart/gpu-test5.yaml | 4 ++++ demo/specs/quickstart/gpu-test6.yaml | 4 ++++ 9 files changed, 48 insertions(+), 7 deletions(-) diff --git a/demo/clusters/gke/create-cluster.sh b/demo/clusters/gke/create-cluster.sh index fd876add..4240b80a 100755 --- a/demo/clusters/gke/create-cluster.sh +++ b/demo/clusters/gke/create-cluster.sh @@ -35,6 +35,7 @@ DRIVER_NAME=$(from_versions_mk "DRIVER_NAME") NETWORK_NAME="${DRIVER_NAME}-net" CLUSTER_NAME="${DRIVER_NAME}-cluster" +NODE_VERSION="1.31.1-gke.2105000" ## Create the Network for the cluster gcloud compute networks create "${NETWORK_NAME}" \ @@ -52,8 +53,10 @@ gcloud container clusters create "${CLUSTER_NAME}" \ --no-enable-autorepair \ --no-enable-autoupgrade \ --region us-west1 \ + --num-nodes "1" \ --network "${NETWORK_NAME}" \ - --node-labels=nvidia.com/dra.controller=true + --cluster-version "${NODE_VERSION}" \ + --node-version "${NODE_VERSION}" # Create t4 node pool gcloud beta container node-pools create "pool-1" \ @@ -61,7 +64,7 @@ gcloud beta container node-pools create "pool-1" \ --project "${PROJECT_NAME}" \ --cluster "${CLUSTER_NAME}" \ --region "us-west1" \ - --node-version "1.27.3-gke.100" \ + --node-version "${NODE_VERSION}" \ --machine-type "n1-standard-8" \ --accelerator "type=nvidia-tesla-t4,count=1" \ --image-type "UBUNTU_CONTAINERD" \ @@ -79,7 +82,7 @@ gcloud beta container node-pools create "pool-1" \ --max-surge-upgrade 1 \ --max-unavailable-upgrade 0 \ --node-locations "us-west1-a" \ - --node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu=present,nvidia.com/dra.kubelet-plugin=true + --node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu.present=true # Create v100 node pool gcloud beta container node-pools create "pool-2" \ @@ -87,7 +90,7 @@ gcloud beta container node-pools create "pool-2" \ --project "${PROJECT_NAME}" \ --cluster "${CLUSTER_NAME}" \ --region "us-west1" \ - --node-version "1.27.3-gke.100" \ + --node-version "${NODE_VERSION}" \ --machine-type "n1-standard-8" \ --accelerator "type=nvidia-tesla-v100,count=1" \ --image-type "UBUNTU_CONTAINERD" \ @@ -105,7 +108,7 @@ gcloud beta container node-pools create "pool-2" \ --max-surge-upgrade 1 \ --max-unavailable-upgrade 0 \ --node-locations "us-west1-a" \ - --node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu=present,nvidia.com/dra.kubelet-plugin=true + --node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu.present=true ## Allow the GPU nodes access to the internet gcloud compute routers create ${NETWORK_NAME}-nat-router \ @@ -126,10 +129,11 @@ gcloud compute routers nats create "${NETWORK_NAME}-nat-config" \ gcloud container clusters get-credentials "${CLUSTER_NAME}" --location="us-west1" ## Launch the nvidia-driver-installer daemonset to install the GPU drivers on any GPU nodes that come online: +kubectl label node --overwrite -l nvidia.com/gpu.present=true cloud.google.com/gke-gpu-driver-version- kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/ubuntu/daemonset-preloaded.yaml ## Create the nvidia namespace kubectl create namespace nvidia ## Deploy a custom daemonset that prepares a node for use with DRA -kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-dra-driver/456d097feb452cca1351817bab2ccd0782e96c9f/demo/prepare-gke-nodes-for-dra.yaml +kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-dra-driver/3498c9a91cb594af94c9e8d65177b131e380e116/demo/prepare-gke-nodes-for-dra.yaml diff --git a/demo/clusters/gke/install-dra-driver.sh b/demo/clusters/gke/install-dra-driver.sh index 5e3ecb15..f48aab36 100755 --- a/demo/clusters/gke/install-dra-driver.sh +++ b/demo/clusters/gke/install-dra-driver.sh @@ -27,7 +27,7 @@ DRIVER_NAME=$(from_versions_mk "DRIVER_NAME") : ${IMAGE_REGISTRY:=ghcr.io/nvidia} : ${IMAGE_NAME:=${DRIVER_NAME}} -: ${IMAGE_TAG:=9323da2d-ubuntu20.04} +: ${IMAGE_TAG:=32805fec-ubi8} helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \ --set image.repository=${IMAGE_REGISTRY}/${IMAGE_NAME} \ @@ -35,6 +35,7 @@ helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJEC --set image.pullPolicy=Always \ --set controller.priorityClassName="" \ --set kubeletPlugin.priorityClassName="" \ + --set deviceClasses="{gpu,mig}" \ --set nvidiaDriverRoot="/opt/nvidia" \ --set kubeletPlugin.tolerations[0].key=nvidia.com/gpu \ --set kubeletPlugin.tolerations[0].operator=Exists \ diff --git a/demo/specs/quickstart/gpu-test-mps.yaml b/demo/specs/quickstart/gpu-test-mps.yaml index 25112520..8fd1411f 100644 --- a/demo/specs/quickstart/gpu-test-mps.yaml +++ b/demo/specs/quickstart/gpu-test-mps.yaml @@ -57,3 +57,7 @@ spec: resourceClaims: - name: shared-gpu resourceClaimTemplateName: shared-gpu + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" diff --git a/demo/specs/quickstart/gpu-test1.yaml b/demo/specs/quickstart/gpu-test1.yaml index e06c1b1d..87817dab 100644 --- a/demo/specs/quickstart/gpu-test1.yaml +++ b/demo/specs/quickstart/gpu-test1.yaml @@ -40,6 +40,10 @@ spec: resourceClaims: - name: gpu resourceClaimTemplateName: single-gpu + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" --- apiVersion: v1 @@ -61,3 +65,7 @@ spec: resourceClaims: - name: gpu resourceClaimTemplateName: single-gpu + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" diff --git a/demo/specs/quickstart/gpu-test2.yaml b/demo/specs/quickstart/gpu-test2.yaml index 626d0103..b050624d 100644 --- a/demo/specs/quickstart/gpu-test2.yaml +++ b/demo/specs/quickstart/gpu-test2.yaml @@ -45,3 +45,7 @@ spec: resourceClaims: - name: shared-gpu resourceClaimTemplateName: single-gpu + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" diff --git a/demo/specs/quickstart/gpu-test3.yaml b/demo/specs/quickstart/gpu-test3.yaml index 03f77e88..d2217ada 100644 --- a/demo/specs/quickstart/gpu-test3.yaml +++ b/demo/specs/quickstart/gpu-test3.yaml @@ -39,6 +39,10 @@ spec: resourceClaims: - name: shared-gpu resourceClaimName: single-gpu + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" --- apiVersion: v1 @@ -60,3 +64,7 @@ spec: resourceClaims: - name: shared-gpu resourceClaimName: single-gpu + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" diff --git a/demo/specs/quickstart/gpu-test4.yaml b/demo/specs/quickstart/gpu-test4.yaml index 4bae78f4..06f5e604 100644 --- a/demo/specs/quickstart/gpu-test4.yaml +++ b/demo/specs/quickstart/gpu-test4.yaml @@ -97,3 +97,7 @@ spec: claims: - name: mig-devices request: mig-3g-20gb + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" diff --git a/demo/specs/quickstart/gpu-test5.yaml b/demo/specs/quickstart/gpu-test5.yaml index 8d6d62d5..ca42d807 100644 --- a/demo/specs/quickstart/gpu-test5.yaml +++ b/demo/specs/quickstart/gpu-test5.yaml @@ -87,3 +87,7 @@ spec: resourceClaims: - name: shared-gpus resourceClaimTemplateName: multiple-gpus + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" diff --git a/demo/specs/quickstart/gpu-test6.yaml b/demo/specs/quickstart/gpu-test6.yaml index a2a654fd..c7d231e2 100644 --- a/demo/specs/quickstart/gpu-test6.yaml +++ b/demo/specs/quickstart/gpu-test6.yaml @@ -70,3 +70,7 @@ spec: resourceClaims: - name: a100 resourceClaimTemplateName: a100 + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule"