Skip to content

Commit

Permalink
Merge pull request kubeagi#400 from bjwswang/chore
Browse files Browse the repository at this point in the history
feat: able to offline worker instead of remove
  • Loading branch information
bjwswang authored Dec 20, 2023
2 parents c83a6b1 + 04f4d1a commit d78944e
Show file tree
Hide file tree
Showing 19 changed files with 431 additions and 157 deletions.
5 changes: 5 additions & 0 deletions api/base/v1alpha1/condition.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,11 @@ func (s *ConditionedStatus) IsReady() bool {
return s.GetCondition(TypeReady).Status == corev1.ConditionTrue
}

func (s *ConditionedStatus) IsOffline() bool {
readyCond := s.GetCondition(TypeReady)
return readyCond.Status == corev1.ConditionFalse && readyCond.Reason == "Offline"
}

func (s *ConditionedStatus) WaitingCompleteCondition() []Condition {
return []Condition{{
Type: TypeReady,
Expand Down
25 changes: 25 additions & 0 deletions api/base/v1alpha1/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ const (

const (
LabelWorkerType = Group + "/worker-type"

// Labels for worker's Pod
WorkerPodSelectorLabel = "app.kubernetes.io/name"
WorkerPodLabel = Group + "/worker"
)

func DefaultWorkerType() WorkerType {
Expand Down Expand Up @@ -105,6 +109,27 @@ func (worker Worker) ReadyCondition() Condition {
}
}

func (worker Worker) OfflineCondition() Condition {
currCon := worker.Status.GetCondition(TypeReady)
// return current condition if condition not changed
if currCon.Status == corev1.ConditionTrue && currCon.Reason == "Offline" {
return currCon
}
// keep original LastSuccessfulTime if have
lastSuccessfulTime := metav1.Now()
if currCon.LastSuccessfulTime.IsZero() {
lastSuccessfulTime = currCon.LastSuccessfulTime
}
return Condition{
Type: TypeReady,
Status: corev1.ConditionFalse,
Reason: "Offline",
Message: "Work is offline",
LastTransitionTime: metav1.Now(),
LastSuccessfulTime: lastSuccessfulTime,
}
}

func (worker Worker) ErrorCondition(msg string) Condition {
currCon := worker.Status.GetCondition(TypeReady)
// return current condition if condition not changed
Expand Down
5 changes: 5 additions & 0 deletions api/base/v1alpha1/worker_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ type WorkerSpec struct {
// Model this worker wants to use
Model *TypedObjectReference `json:"model"`

// Replicas of this worker instance(1 by default)
// +kubebuilder:default=1
// +kubebuilder:validation:Maximum=1
Replicas *int32 `json:"replicas,omitempty"`

// Resource request&limits including
// - CPU or GPU
// - Memory
Expand Down
5 changes: 5 additions & 0 deletions api/base/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

77 changes: 76 additions & 1 deletion apiserver/graph/generated/generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion apiserver/graph/generated/models_gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions apiserver/graph/schema/worker.gql
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ query listWorkers($input: ListWorkerInput!){
}
api
modelTypes
replicas
resources {
cpu
memory
Expand Down Expand Up @@ -64,6 +65,7 @@ query getWorker($name: String!, $namespace: String!) {
}
api
modelTypes
replicas
resources {
cpu
memory
Expand Down Expand Up @@ -98,6 +100,7 @@ mutation createWorker($input: CreateWorkerInput!) {
}
api
modelTypes
replicas
resources {
cpu
memory
Expand All @@ -124,6 +127,7 @@ mutation updateWorker($input: UpdateWorkerInput) {
status
message
updateTimestamp
replicas
resources {
cpu
memory
Expand Down
9 changes: 9 additions & 0 deletions apiserver/graph/schema/worker.graphqls
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,13 @@ type Worker {
"""
modelTypes: String!

"""
worker运行的Pod副本数量
规则: 默认为1,最大值为1
规则: 为0时,即下线
"""
replicas: String

"""
worker运行所需的资源
规则: 必填
Expand Down Expand Up @@ -170,6 +177,8 @@ input UpdateWorkerInput {
"""
type: String

replicas: String

"""
worker运行所需的资源
"""
Expand Down
19 changes: 19 additions & 0 deletions apiserver/pkg/worker/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ package worker

import (
"context"
"fmt"
"sort"
"strconv"
"strings"

"github.com/pkg/errors"
Expand Down Expand Up @@ -60,6 +62,12 @@ func worker2model(ctx context.Context, c dynamic.Interface, obj *unstructured.Un
// Unknown,Pending ,Running ,Error
status := common.GetObjStatus(worker)

// replicas
var replicas string
if worker.Spec.Replicas != nil {
replicas = fmt.Sprint(worker.Spec.Replicas)
}

// resources
cpu := worker.Spec.Resources.Limits[v1.ResourceCPU]
cpuStr := cpu.String()
Expand Down Expand Up @@ -91,6 +99,7 @@ func worker2model(ctx context.Context, c dynamic.Interface, obj *unstructured.Un
Status: &status,
CreationTimestamp: &creationtimestamp,
UpdateTimestamp: &updateTime,
Replicas: &replicas,
Resources: resources,
ModelTypes: "unknown",
API: &api,
Expand Down Expand Up @@ -216,6 +225,16 @@ func UpdateWorker(ctx context.Context, c dynamic.Interface, input *generated.Upd
}
}

// replicas
if input.Replicas != nil {
replicas, err := strconv.ParseInt(*input.Replicas, 10, 32)
if err != nil {
return nil, errors.Wrap(err, "Invalid replicas")
}
replicasInt32 := int32(replicas)
worker.Spec.Replicas = &replicasInt32
}

// resources
if input.Resources != nil {
// cpu & memory
Expand Down
6 changes: 6 additions & 0 deletions config/crd/bases/arcadia.kubeagi.k8s.com.cn_workers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ spec:
- kind
- name
type: object
replicas:
default: 1
description: Replicas of this worker instance(1 by default)
format: int32
maximum: 1
type: integer
resources:
description: Resource request&limits including - CPU or GPU - Memory
properties:
Expand Down
1 change: 1 addition & 0 deletions config/samples/arcadia_v1alpha1_worker_baichuan2-7b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ spec:
model:
kind: "Models"
name: "baichuan2-7b-chat"
replicas: 1
resources:
limits:
nvidia.com/gpu: "1" # request 1 GPU
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ metadata:
namespace: arcadia
spec:
type: "fastchat"
replicas: 1
model:
kind: "Models"
name: "bge-large-zh-v1.5"
14 changes: 14 additions & 0 deletions config/samples/arcadia_v1alpha1_worker_qwen-7b-chat.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
apiVersion: arcadia.kubeagi.k8s.com.cn/v1alpha1
kind: Worker
metadata:
name: qwen-7b-chat
namespace: kubeagi-system
spec:
type: "fastchat"
model:
kind: "Models"
name: "qwen-7b-chat"
replicas: 1
resources:
limits:
nvidia.com/gpu: "1" # request 1 GPU
Loading

0 comments on commit d78944e

Please sign in to comment.