fix: hardcode resource request of gpus to 1 if utilizing an existing ray …

…cluster Signed-off-by: bjwswang <[email protected]>
bjwswang · Mar 27, 2024 · fbf1d5f · fbf1d5f
1 parent 7f8cf97
commit fbf1d5f
Show file tree

Hide file tree

Showing 5 changed files with 19 additions and 11 deletions.
diff --git a/.github/workflows/image_build.yml b/.github/workflows/image_build.yml
@@ -12,7 +12,7 @@ env:
 
 jobs:
   image:
-    if: github.repository == 'kubeagi/arcadia'
+    if: github.repository == 'bjwswang/arcadia'
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4

diff --git a/deploy/charts/arcadia/Chart.yaml b/deploy/charts/arcadia/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 name: arcadia
 description: A Helm chart(Also a KubeBB Component) for KubeAGI Arcadia
 type: application
-version: 0.3.23
+version: 0.3.24
 appVersion: "0.2.1"
 
 keywords:

diff --git a/deploy/charts/arcadia/templates/config.yaml b/deploy/charts/arcadia/templates/config.yaml
@@ -84,19 +84,19 @@ data:
       - id: 4
         name: "效率"
         nameEn: "Efficiency"
-      - id: 4
+      - id: 5
         name: "人物扮演"
         nameEn: "Character Play"
-      - id: 5
+      - id: 6
         name: "游戏"
         nameEn: "Game"
-      - id: 6
+      - id: 7
         name: "生活"
         nameEn: "Life"
-      - id: 7
+      - id: 8
         name: "情感"
         nameEn: "Emotion"
-      - id: 8
+      - id: 9
         name: "动漫"
         nameEn: "Anime"
 {{- end }}

diff --git a/pkg/config/config_type.go b/pkg/config/config_type.go
@@ -112,7 +112,7 @@ func (rayCluster RayCluster) GetPythonVersion() string {
 	return rayCluster.PythonVersion
 }
 
-// DefaultRayCluster which can be used for vllm worker as local ray cluster
+// DefaultRayCluster which can be used for vllm worker as local ray cluster which can only utilize single node gpus
 func DefaultRayCluster() RayCluster {
 	return RayCluster{
 		Name:          "default",

diff --git a/pkg/worker/runner.go b/pkg/worker/runner.go
@@ -29,6 +29,7 @@ import (
 
 	arcadiav1alpha1 "github.com/kubeagi/arcadia/api/base/v1alpha1"
 	"github.com/kubeagi/arcadia/pkg/config"
+	"k8s.io/apimachinery/pkg/api/resource"
 )
 
 const (
@@ -196,8 +197,12 @@ func (runner *RunnerFastchatVLLM) Build(ctx context.Context, model *arcadiav1alp
 	additionalEnvs := []corev1.EnvVar{}
 
 	// configure ray cluster
+	resources := runner.w.Spec.Resources
+	gpus := runner.NumberOfGPUs()
+	// default ray cluster which can only utilize gpus on single nodes
 	rayCluster := config.DefaultRayCluster()
 	for _, envItem := range runner.w.Spec.AdditionalEnvs {
+		// using existing ray cluster
 		if envItem.Name == "RAY_CLUSTER_INDEX" {
 			externalRayClusterIndex, _ := strconv.Atoi(envItem.Value)
 			rayClusters, err := config.GetRayClusters(ctx, runner.c)
@@ -208,6 +213,8 @@ func (runner *RunnerFastchatVLLM) Build(ctx context.Context, model *arcadiav1alp
 				return nil, fmt.Errorf("no ray clusters configured")
 			}
 			rayCluster = rayClusters[externalRayClusterIndex]
+			// Hardcoded directly requested gpu to 1 if using existing ray cluster
+			resources.Limits[ResourceNvidiaGPU] = resource.MustParse("1")
 		}
 
 		// set gpu memory utilization
@@ -224,6 +231,8 @@ func (runner *RunnerFastchatVLLM) Build(ctx context.Context, model *arcadiav1alp
 			extraAgrs = envItem.Value
 		}
 	}
+	klog.V(5).Infof("run worker with raycluster:\n %s", rayCluster.String())
+
 	// set ray configurations into additional environments
 	additionalEnvs = append(additionalEnvs,
 		corev1.EnvVar{
@@ -237,8 +246,7 @@ func (runner *RunnerFastchatVLLM) Build(ctx context.Context, model *arcadiav1alp
 			Value: rayCluster.GetPythonVersion(),
 		})
 	// Set gpu number to the number of GPUs in the worker's resource
-	additionalEnvs = append(additionalEnvs, corev1.EnvVar{Name: "NUMBER_GPUS", Value: runner.NumberOfGPUs()})
-	klog.V(5).Infof("run worker with raycluster:\n %s", rayCluster.String())
+	additionalEnvs = append(additionalEnvs, corev1.EnvVar{Name: "NUMBER_GPUS", Value: gpus})
 
 	modelFileDir := fmt.Sprintf("%s/%s", defaultModelMountPath, model.Name)
 	// --enforce-eager to disable cupy
@@ -287,7 +295,7 @@ func (runner *RunnerFastchatVLLM) Build(ctx context.Context, model *arcadiav1alp
 			// mount volume to /dev/shm to avoid Bus error
 			{Name: "models", MountPath: defaultShmMountPath},
 		},
-		Resources: runner.w.Spec.Resources,
+		Resources: resources,
 	}
 	container.Env = append(container.Env, additionalEnvs...)
 	return container, nil