Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Go to 1.21.5 and k8s.io/* to 0.29.1 #74

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
2 changes: 1 addition & 1 deletion .github/workflows/golang.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
- name: Install Go
uses: actions/setup-go@v5
with:
go-version: '1.20'
go-version: '1.21.5'
- run: make test
build:
runs-on: ubuntu-latest
Expand Down
20 changes: 15 additions & 5 deletions cmd/nvidia-dra-controller/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,17 @@ func (d driver) GetClaimParameters(ctx context.Context, claim *resourcev1.Resour
return nil, fmt.Errorf("unknown ResourceClaim.ParametersRef.Kind: %v", claim.Spec.ParametersRef.Kind)
}

func (d driver) Allocate(ctx context.Context, claim *resourcev1.ResourceClaim, claimParameters interface{}, class *resourcev1.ResourceClass, classParameters interface{}, selectedNode string) (*resourcev1.AllocationResult, error) {
func (d driver) Allocate(ctx context.Context, cas []*controller.ClaimAllocation, selectedNode string) {
// In a production version of the driver, the operations common to every
// d.allocate() call should be done before this loop so their results can
// be reused across iterations.
// E.g.: the selectedNode=="" check, client setup, and CRD fetching.
for _, ca := range cas {
ca.Allocation, ca.Error = d.allocate(ctx, ca.Claim, ca.ClaimParameters, ca.Class, ca.ClassParameters, selectedNode)
}
}

func (d driver) allocate(ctx context.Context, claim *resourcev1.ResourceClaim, claimParameters interface{}, class *resourcev1.ResourceClass, classParameters interface{}, selectedNode string) (*resourcev1.AllocationResult, error) {
if selectedNode == "" {
return nil, fmt.Errorf("TODO: immediate allocations not yet supported")
}
Expand All @@ -126,6 +136,10 @@ func (d driver) Allocate(ctx context.Context, claim *resourcev1.ResourceClaim, c
return nil, fmt.Errorf("error retrieving node specific Gpu CRD: %w", err)
}

if crd.Status != nascrd.NodeAllocationStateStatusReady {
return nil, fmt.Errorf("NodeAllocationStateStatus: %v", crd.Status)
}

if crd.Spec.AllocatedClaims == nil {
crd.Spec.AllocatedClaims = make(map[string]nascrd.AllocatedDevices)
}
Expand All @@ -134,10 +148,6 @@ func (d driver) Allocate(ctx context.Context, claim *resourcev1.ResourceClaim, c
return buildAllocationResult(selectedNode, true), nil
}

if crd.Status != nascrd.NodeAllocationStateStatusReady {
return nil, fmt.Errorf("NodeAllocationStateStatus: %v", crd.Status)
}

var onSuccess OnSuccessCallback
classParams, ok := classParameters.(*gpucrd.DeviceClassParametersSpec)
if !ok {
Expand Down
63 changes: 40 additions & 23 deletions cmd/nvidia-dra-plugin/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import (
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/util/retry"
"k8s.io/klog/v2"
drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1alpha2"
drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1alpha3"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't think we could make this change until we also made the kind changes. My expectation was that this PR would pull in the controller changes from #14 but not the kubelet-plugin changes. I'd also prefer to see this as a separate commit, not bundled in the same commit as the go.mod changes.


nascrd "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1"
nasclient "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1/client"
Expand Down Expand Up @@ -99,39 +99,56 @@ func (d *driver) Shutdown(ctx context.Context) error {
})
}

func (d *driver) NodePrepareResource(ctx context.Context, req *drapbv1.NodePrepareResourceRequest) (*drapbv1.NodePrepareResourceResponse, error) {
func (d *driver) NodePrepareResources(ctx context.Context, req *drapbv1.NodePrepareResourcesRequest) (*drapbv1.NodePrepareResourcesResponse, error) {

klog.Infof("NodePrepareResource is called: number of claims: %d", len(req.Claims))
preparedResources := &drapbv1.NodePrepareResourcesResponse{Claims: map[string]*drapbv1.NodePrepareResourceResponse{}}

// In a production version, operations common to every d.nodePrepareResource()
// call should be done outside of the loop; for instance, updating the CR
// could be done once after all hardware was prepared.
for _, claim := range req.Claims {
preparedResources.Claims[claim.Uid] = d.nodePrepareResource(ctx, claim)
}

return preparedResources, nil
}

func (d *driver) nodePrepareResource(ctx context.Context, claim *drapbv1.Claim) *drapbv1.NodePrepareResourceResponse {
d.Lock()
defer d.Unlock()

klog.Infof("NodePrepareResource is called: request: %+v", req)

isPrepared, prepared, err := d.IsPrepared(ctx, req.ClaimUid)
isPrepared, prepared, err := d.isPrepared(ctx, claim.Uid)
if err != nil {
return nil, fmt.Errorf("error checking if claim is already prepared: %w", err)
return &drapbv1.NodePrepareResourceResponse{
Error: fmt.Sprintf("error checking if claim is already prepared: %v", err),
}
}

if isPrepared {
klog.Infof("Returning cached devices for claim '%v': %s", req.ClaimUid, prepared)
return &drapbv1.NodePrepareResourceResponse{CdiDevices: prepared}, nil
klog.Infof("Returning cached devices for claim '%v': %s", claim.Uid, prepared)
return &drapbv1.NodePrepareResourceResponse{CDIDevices: prepared}
}

prepared, err = d.Prepare(ctx, req.ClaimUid)
prepared, err = d.prepare(ctx, claim.Uid)
if err != nil {
return nil, fmt.Errorf("error preparing devices for claim %v: %w", req.ClaimUid, err)
return &drapbv1.NodePrepareResourceResponse{
Error: fmt.Sprintf("error preparing devices for claim %v: %v", claim.Uid, err),
}
}

klog.Infof("Returning newly prepared devices for claim '%v': %s", req.ClaimUid, prepared)
return &drapbv1.NodePrepareResourceResponse{CdiDevices: prepared}, nil
klog.Infof("Returning newly prepared devices for claim '%v': %s", claim.Uid, prepared)
return &drapbv1.NodePrepareResourceResponse{CDIDevices: prepared}
}

func (d *driver) NodeUnprepareResource(ctx context.Context, req *drapbv1.NodeUnprepareResourceRequest) (*drapbv1.NodeUnprepareResourceResponse, error) {
func (d *driver) NodeUnprepareResources(ctx context.Context, req *drapbv1.NodeUnprepareResourcesRequest) (*drapbv1.NodeUnprepareResourcesResponse, error) {
// We don't unprepare as part of NodeUnprepareResources; we do it
// asynchronously when the claims themselves are deleted and the
// AllocatedClaim has been removed.
return &drapbv1.NodeUnprepareResourceResponse{}, nil
return &drapbv1.NodeUnprepareResourcesResponse{}, nil
}

func (d *driver) IsPrepared(ctx context.Context, claimUID string) (bool, []string, error) {
func (d *driver) isPrepared(ctx context.Context, claimUID string) (bool, []string, error) {
err := d.nasclient.Get(ctx)
if err != nil {
return false, nil, err
Expand All @@ -142,7 +159,7 @@ func (d *driver) IsPrepared(ctx context.Context, claimUID string) (bool, []strin
return false, nil, nil
}

func (d *driver) Prepare(ctx context.Context, claimUID string) ([]string, error) {
func (d *driver) prepare(ctx context.Context, claimUID string) ([]string, error) {
var err error
var prepared []string
err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
Expand All @@ -169,7 +186,7 @@ func (d *driver) Prepare(ctx context.Context, claimUID string) ([]string, error)
return prepared, nil
}

func (d *driver) Unprepare(ctx context.Context, claimUID string) error {
func (d *driver) unprepare(ctx context.Context, claimUID string) error {
err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
err := d.nasclient.Get(ctx)
if err != nil {
Expand Down Expand Up @@ -198,12 +215,12 @@ func (d *driver) CleanupStaleStateContinuously(ctx context.Context) {
for {
resourceVersion, err := d.cleanupStaleStateOnce(ctx)
if err != nil {
klog.Errorf("Error cleaning up stale claim state: %w", err)
klog.Errorf("Error cleaning up stale claim state: %v", err)
}

err = d.cleanupStaleStateContinuously(ctx, resourceVersion, err)
if err != nil {
klog.Errorf("Error cleaning up stale claim state: %w", err)
klog.Errorf("Error cleaning up stale claim state: %v", err)
time.Sleep(CleanupTimeoutSecondsOnError * time.Second)
}
}
Expand Down Expand Up @@ -279,7 +296,7 @@ func (d *driver) cleanupStaleState(ctx context.Context, nas *nascrd.NodeAllocati
go func() {
count := 0
for err := range caErrors {
klog.Errorf("Error cleaning up claim allocations: %w", err)
klog.Errorf("Error cleaning up claim allocations: %v", err)
count++
}
errorCounts <- count
Expand All @@ -290,7 +307,7 @@ func (d *driver) cleanupStaleState(ctx context.Context, nas *nascrd.NodeAllocati
go func() {
count := 0
for err := range cdiErrors {
klog.Errorf("Error cleaning up CDI files: %w", err)
klog.Errorf("Error cleaning up CDI files: %v", err)
count++
}
errorCounts <- count
Expand All @@ -301,7 +318,7 @@ func (d *driver) cleanupStaleState(ctx context.Context, nas *nascrd.NodeAllocati
go func() {
count := 0
for err := range mpsErrors {
klog.Errorf("Error cleaning up MPS control daemon artifacts: %w", err)
klog.Errorf("Error cleaning up MPS control daemon artifacts: %v", err)
count++
}
errorCounts <- count
Expand Down Expand Up @@ -329,7 +346,7 @@ func (d *driver) cleanupClaimAllocations(ctx context.Context, nas *nascrd.NodeAl
go func(claimUID string) {
defer wg.Done()
klog.Infof("Attempting to unprepare resources for claim %v", claimUID)
err := d.Unprepare(ctx, claimUID)
err := d.unprepare(ctx, claimUID)
if err != nil {
errors <- fmt.Errorf("error unpreparing resources for claim %v: %w", claimUID, err)
return
Expand Down
2 changes: 1 addition & 1 deletion cmd/nvidia-dra-plugin/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ func StartPlugin(ctx context.Context, config *Config) error {

err = driver.Shutdown(ctx)
if err != nil {
klog.Errorf("Unable to cleanly shutdown driver: %w", err)
klog.Errorf("Unable to cleanly shutdown driver: %v", err)
}

return nil
Expand Down
25 changes: 23 additions & 2 deletions deployments/container/Dockerfile.ubi8
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,32 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG GOLANG_VERSION=1.20.4
ARG GOLANG_VERSION=1.21.5
ARG CUDA_IMAGE=cuda
ARG CUDA_VERSION=11.8.0
ARG BASE_DIST=ubi8
FROM golang:${GOLANG_VERSION} as build
FROM nvidia/cuda:${CUDA_VERSION}-base-${BASE_DIST} as build

RUN yum install -y \
wget make git gcc \
&& \
rm -rf /var/cache/yum/*

ARG GOLANG_VERSION=x.x.x
RUN set -eux; \
\
arch="$(uname -m)"; \
case "${arch##*-}" in \
x86_64 | amd64) ARCH='amd64' ;; \
ppc64el | ppc64le) ARCH='ppc64le' ;; \
aarch64) ARCH='arm64' ;; \
*) echo "unsupported architecture" ; exit 1 ;; \
esac; \
wget -nv -O - https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-${ARCH}.tar.gz \
| tar -C /usr/local -xz

ENV GOPATH /go
ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH

WORKDIR /build
COPY . .
Expand Down
25 changes: 23 additions & 2 deletions deployments/container/Dockerfile.ubuntu
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,32 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG GOLANG_VERSION=1.20.4
ARG GOLANG_VERSION=1.21.5
ARG CUDA_IMAGE=cuda
ARG CUDA_VERSION=11.8.0
ARG BASE_DIST=ubuntu20.04
FROM golang:${GOLANG_VERSION} as build
FROM nvidia/cuda:${CUDA_VERSION}-base-${BASE_DIST} as build

RUN apt-get update && \
apt-get install -y wget make git gcc \
&& \
rm -rf /var/lib/apt/lists/*

ARG GOLANG_VERSION=x.x.x
RUN set -eux; \
\
arch="$(uname -m)"; \
case "${arch##*-}" in \
x86_64 | amd64) ARCH='amd64' ;; \
ppc64el | ppc64le) ARCH='ppc64le' ;; \
aarch64) ARCH='arm64' ;; \
*) echo "unsupported architecture" ; exit 1 ;; \
esac; \
wget -nv -O - https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-${ARCH}.tar.gz \
| tar -C /usr/local -xz

ENV GOPATH /go
ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH

WORKDIR /build
COPY . .
Expand Down
Loading