Skip to content

Commit

Permalink
Add new nvkind target for demo/clusters with multinode GPUs support
Browse files Browse the repository at this point in the history
The new nvkind target assumes you have nvkind installed on your system. If you
do, it will split all of the GPUs onto individual nodes and spin up a multi-node
kind cluster with 1 GPU per node.

Signed-off-by: Kevin Klues <[email protected]>
  • Loading branch information
klueska committed Sep 20, 2024
1 parent 473c918 commit 1e9a437
Show file tree
Hide file tree
Showing 13 changed files with 108 additions and 2 deletions.
4 changes: 2 additions & 2 deletions demo/clusters/kind/install-dra-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ set -o pipefail

source "${CURRENT_DIR}/scripts/common.sh"

kubectl label node "${KIND_CLUSTER_NAME}-worker" --overwrite nvidia.com/dra.kubelet-plugin=true
kubectl label node "${KIND_CLUSTER_NAME}-control-plane" --overwrite nvidia.com/dra.controller=true
kubectl label node -l node-role.x-k8s.io/worker --overwrite nvidia.com/dra.kubelet-plugin=true
kubectl label node -l node-role.x-k8s.io/control-plane --overwrite nvidia.com/dra.controller=true

helm upgrade -i --create-namespace --namespace nvidia-dra-driver nvidia ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \
${NVIDIA_DRIVER_ROOT:+--set nvidiaDriverRoot=${NVIDIA_DRIVER_ROOT}} \
Expand Down
4 changes: 4 additions & 0 deletions demo/clusters/kind/scripts/kind-cluster-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ containerdConfigPatches:
enable_cdi = true
nodes:
- role: control-plane
labels:
node-role.x-k8s.io/control-plane: ""
kubeadmConfigPatches:
- |
kind: ClusterConfiguration
Expand All @@ -43,6 +45,8 @@ nodes:
kubeletExtraArgs:
v: "1"
- role: worker
labels:
node-role.x-k8s.io/worker: ""
kubeadmConfigPatches:
- |
kind: JoinConfiguration
Expand Down
1 change: 1 addition & 0 deletions demo/clusters/nvkind/build-dra-driver.sh
1 change: 1 addition & 0 deletions demo/clusters/nvkind/create-cluster.sh
1 change: 1 addition & 0 deletions demo/clusters/nvkind/delete-cluster.sh
1 change: 1 addition & 0 deletions demo/clusters/nvkind/install-dra-driver.sh
1 change: 1 addition & 0 deletions demo/clusters/nvkind/scripts/build-driver-image.sh
1 change: 1 addition & 0 deletions demo/clusters/nvkind/scripts/build-kind-image.sh
1 change: 1 addition & 0 deletions demo/clusters/nvkind/scripts/common.sh
30 changes: 30 additions & 0 deletions demo/clusters/nvkind/scripts/create-kind-cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

# Copyright 2023 The Kubernetes Authors.
# Copyright 2023 NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Absolute path of the directory containing this script, so that sibling
# files can be sourced regardless of the caller's working directory.
CURRENT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)"

# Stop at the first failing command and echo every command before it runs;
# also treat a failure anywhere inside a pipeline as a failure of the pipeline.
set -o errexit -o xtrace
set -o pipefail

# Shared demo settings. Presumably this is where KIND_CLUSTER_NAME, KIND_IMAGE
# and KIND_CLUSTER_CONFIG_PATH come from -- confirm against common.sh.
source "${CURRENT_DIR}/common.sh"

# Create the cluster through nvkind, rendering the cluster config from a
# template rather than passing a static kind config.
nvkind cluster create \
    --retain \
    --name "${KIND_CLUSTER_NAME}" \
    --image "${KIND_IMAGE}" \
    --config-template "${KIND_CLUSTER_CONFIG_PATH}"
1 change: 1 addition & 0 deletions demo/clusters/nvkind/scripts/delete-kind-cluster.sh
63 changes: 63 additions & 0 deletions demo/clusters/nvkind/scripts/kind-cluster-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright 2023 The Kubernetes Authors.
# Copyright 2023 NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# kind cluster definition for the nvkind demo. This file is a template: the
# template actions below are expanded first (create-kind-cluster.sh passes it
# via --config-template) and only the rendered result is parsed as YAML.
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
# Turn on the DynamicResourceAllocation feature gate for the cluster.
featureGates:
  DynamicResourceAllocation: true
containerdConfigPatches:
# Enable CDI as described in
# https://tags.cncf.io/container-device-interface#containerd-configuration
- |-
  [plugins."io.containerd.grpc.v1.cri"]
    enable_cdi = true
nodes:
- role: control-plane
  # The demo's install-dra-driver.sh selects nodes by this label
  # (kubectl label node -l node-role.x-k8s.io/control-plane ...).
  labels:
    node-role.x-k8s.io/control-plane: ""
  kubeadmConfigPatches:
  # Serve the resource.k8s.io/v1alpha3 API from the API server and pass
  # v=1 to the scheduler and controller manager.
  - |
    kind: ClusterConfiguration
    apiServer:
      extraArgs:
        runtime-config: "resource.k8s.io/v1alpha3=true"
    scheduler:
      extraArgs:
        v: "1"
    controllerManager:
      extraArgs:
        v: "1"
  # Pass v=1 to the kubelet on the control-plane node.
  - |
    kind: InitConfiguration
    nodeRegistration:
      kubeletExtraArgs:
        v: "1"
# One worker node is emitted per GPU; the loop variable is the GPU index that
# is exposed to that worker. NOTE(review): `until` and `numGPUs` are template
# functions supplied by the renderer (presumably nvkind + sprig) -- confirm.
{{- range $gpu := until numGPUs }}
- role: worker
  # The demo's install-dra-driver.sh selects nodes by this label
  # (kubectl label node -l node-role.x-k8s.io/worker ...).
  labels:
    node-role.x-k8s.io/worker: ""
  kubeadmConfigPatches:
  # Pass v=1 to the kubelet on this worker node.
  - |
    kind: JoinConfiguration
    nodeRegistration:
      kubeletExtraArgs:
        v: "1"
  extraMounts:
  # We inject all NVIDIA GPUs using the nvidia-container-runtime.
  # This requires `accept-nvidia-visible-devices-as-volume-mounts = true` be set
  # in `/etc/nvidia-container-runtime/config.toml`
  - hostPath: /dev/null
    containerPath: /var/run/nvidia-container-devices/cdi/runtime.nvidia.com/gpu/{{ $gpu }}
{{- end }}

0 comments on commit 1e9a437

Please sign in to comment.