diff --git a/.github/workflows/image_build.yml b/.github/workflows/image_build.yml index a84c71c22..baa4e4d83 100644 --- a/.github/workflows/image_build.yml +++ b/.github/workflows/image_build.yml @@ -12,7 +12,7 @@ env: jobs: image: - if: github.repository == 'kubeagi/arcadia' + if: github.repository == 'bjwswang/arcadia' runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 diff --git a/deploy/charts/arcadia/Chart.yaml b/deploy/charts/arcadia/Chart.yaml index b401ef1ba..8673a65be 100644 --- a/deploy/charts/arcadia/Chart.yaml +++ b/deploy/charts/arcadia/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: arcadia description: A Helm chart(Also a KubeBB Component) for KubeAGI Arcadia type: application -version: 0.3.23 +version: 0.3.24 appVersion: "0.2.1" keywords: diff --git a/deploy/charts/arcadia/templates/config.yaml b/deploy/charts/arcadia/templates/config.yaml index d4a0c5cc9..830742bd1 100644 --- a/deploy/charts/arcadia/templates/config.yaml +++ b/deploy/charts/arcadia/templates/config.yaml @@ -84,19 +84,19 @@ data: - id: 4 name: "效率" nameEn: "Efficiency" - - id: 4 + - id: 5 name: "人物扮演" nameEn: "Character Play" - - id: 5 + - id: 6 name: "游戏" nameEn: "Game" - - id: 6 + - id: 7 name: "生活" nameEn: "Life" - - id: 7 + - id: 8 name: "情感" nameEn: "Emotion" - - id: 8 + - id: 9 name: "动漫" nameEn: "Anime" {{- end }} diff --git a/pkg/config/config_type.go b/pkg/config/config_type.go index e54103e73..0c3744d3a 100644 --- a/pkg/config/config_type.go +++ b/pkg/config/config_type.go @@ -112,7 +112,7 @@ func (rayCluster RayCluster) GetPythonVersion() string { return rayCluster.PythonVersion } -// DefaultRayCluster which can be used for vllm worker as local ray cluster +// DefaultRayCluster returns the local ray cluster used by vllm workers; it can only utilize the gpus of a single node func DefaultRayCluster() RayCluster { return RayCluster{ Name: "default", diff --git a/pkg/worker/runner.go b/pkg/worker/runner.go index 26602c3cd..62080b8a2 100644 --- a/pkg/worker/runner.go +++ 
b/pkg/worker/runner.go @@ -29,6 +29,7 @@ import ( arcadiav1alpha1 "github.com/kubeagi/arcadia/api/base/v1alpha1" "github.com/kubeagi/arcadia/pkg/config" + "k8s.io/apimachinery/pkg/api/resource" ) const ( @@ -196,8 +197,12 @@ func (runner *RunnerFastchatVLLM) Build(ctx context.Context, model *arcadiav1alp additionalEnvs := []corev1.EnvVar{} // configure ray cluster + resources := runner.w.Spec.Resources + gpus := runner.NumberOfGPUs() + // default ray cluster, which can only utilize the gpus of a single node rayCluster := config.DefaultRayCluster() for _, envItem := range runner.w.Spec.AdditionalEnvs { + // use an existing ray cluster if envItem.Name == "RAY_CLUSTER_INDEX" { externalRayClusterIndex, _ := strconv.Atoi(envItem.Value) rayClusters, err := config.GetRayClusters(ctx, runner.c) @@ -208,6 +213,8 @@ func (runner *RunnerFastchatVLLM) Build(ctx context.Context, model *arcadiav1alp return nil, fmt.Errorf("no ray clusters configured") } rayCluster = rayClusters[externalRayClusterIndex] + // Hardcode the directly requested gpu limit to 1 when using an existing ray cluster + resources.Limits[ResourceNvidiaGPU] = resource.MustParse("1") } // set gpu memory utilization @@ -224,6 +231,8 @@ func (runner *RunnerFastchatVLLM) Build(ctx context.Context, model *arcadiav1alp extraAgrs = envItem.Value } } + klog.V(5).Infof("run worker with raycluster:\n %s", rayCluster.String()) + // set ray configurations into additional environments additionalEnvs = append(additionalEnvs, corev1.EnvVar{ @@ -237,8 +246,7 @@ func (runner *RunnerFastchatVLLM) Build(ctx context.Context, model *arcadiav1alp Value: rayCluster.GetPythonVersion(), }) // Set gpu number to the number of GPUs in the worker's resource - additionalEnvs = append(additionalEnvs, corev1.EnvVar{Name: "NUMBER_GPUS", Value: runner.NumberOfGPUs()}) - klog.V(5).Infof("run worker with raycluster:\n %s", rayCluster.String()) + additionalEnvs = append(additionalEnvs, corev1.EnvVar{Name: "NUMBER_GPUS", Value: gpus}) modelFileDir := 
fmt.Sprintf("%s/%s", defaultModelMountPath, model.Name) // --enforce-eager to disable cupy @@ -287,7 +295,7 @@ func (runner *RunnerFastchatVLLM) Build(ctx context.Context, model *arcadiav1alp // mount volume to /dev/shm to avoid Bus error {Name: "models", MountPath: defaultShmMountPath}, }, - Resources: runner.w.Spec.Resources, + Resources: resources, } container.Env = append(container.Env, additionalEnvs...) return container, nil