Skip to content

Commit

Permalink
Feat: add support for distributed serving type (#1187)
Browse files Browse the repository at this point in the history
* Feat: support distributed serving type

Signed-off-by: 林联辉 <[email protected]>

* Fix command check

Signed-off-by: 林联辉 <[email protected]>

* Fix lint problem

Signed-off-by: 林联辉 <[email protected]>

---------

Signed-off-by: 林联辉 <[email protected]>
Co-authored-by: 林联辉 <[email protected]>
  • Loading branch information
linnlh and 林联辉 authored Nov 7, 2024
1 parent 70278ce commit 68b71f9
Show file tree
Hide file tree
Showing 32 changed files with 2,717 additions and 4 deletions.
21 changes: 21 additions & 0 deletions charts/distributed-serving/.helmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
# Anything matched below is left out of the packaged chart.
# macOS Finder metadata
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*~
# Various IDEs (Eclipse .project, JetBrains .idea/, TextMate *.tmproj)
.project
.idea/
*.tmproj
3 changes: 3 additions & 0 deletions charts/distributed-serving/CHANGLOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
### 0.1.0

* Initial release of the distributed-serving chart
5 changes: 5 additions & 0 deletions charts/distributed-serving/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Chart metadata for the distributed-serving Helm chart.
# Chart API version (v1 is the original Helm chart format).
apiVersion: v1
# Version of the application the chart deploys; quoted so it stays a string
# instead of being parsed as the float 1.0.
appVersion: "1.0"
description: A Helm chart for distributed-serving
name: distributed-serving
# Version of the chart itself.
version: 0.1.0
32 changes: 32 additions & 0 deletions charts/distributed-serving/templates/_helpers.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Short chart name: .Values.nameOverride when it is set, otherwise the chart's
own name. The result is cut to 63 characters and a trailing "-" is removed.
*/}}
{{- define "distributed-serving.name" -}}
{{- $base := .Values.nameOverride | default .Chart.Name -}}
{{- trimSuffix "-" (trunc 63 $base) -}}
{{- end -}}

{{/*
Fully qualified app name, limited to 63 characters because some Kubernetes
name fields are capped at that length (by the DNS naming spec).
Resolution order:
  1. .Values.fullnameOverride, when set.
  2. The release name alone, when it already contains the chart name.
  3. "<release name>-<chart name>" otherwise.
A trailing "-" left over after truncation is removed in every case.
*/}}
{{- define "distributed-serving.fullname" -}}
{{- with .Values.fullnameOverride -}}
{{- trimSuffix "-" (trunc 63 .) -}}
{{- else -}}
{{- $chartName := .Values.nameOverride | default .Chart.Name -}}
{{- $release := .Release.Name -}}
{{- if contains $chartName $release -}}
{{- trimSuffix "-" (trunc 63 $release) -}}
{{- else -}}
{{- trimSuffix "-" (trunc 63 (printf "%s-%s" $release $chartName)) -}}
{{- end -}}
{{- end -}}
{{- end -}}

{{/*
Value for the "chart" label: "<chart name>-<chart version>" with every "+"
replaced by "_", cut to 63 characters and stripped of a trailing "-".
*/}}
{{- define "distributed-serving.chart" -}}
{{- $nameVersion := printf "%s-%s" .Chart.Name .Chart.Version -}}
{{- trimSuffix "-" (trunc 63 (replace "+" "_" $nameVersion)) -}}
{{- end -}}
63 changes: 63 additions & 0 deletions charts/distributed-serving/templates/configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
{{- /*
ConfigMap "<release>-cm" for a distributed-serving release. It holds:
  - hostfile-<replica>: one host entry "<release>.<release>-<replica>.<namespace>"
    followed by one entry per worker index (presumably the replica's master
    pod first, then its workers; confirm against the workload templates).
  - master.rayInit / worker.rayInit: bash scripts that bring up a Ray head or
    worker and then run the command passed as their first argument.
Values read: .Values.replicas, .Values.workers.
*/}}
{{- $releaseName := .Release.Name }}
{{- $namespace := .Release.Namespace }}
{{- $workerNum := .Values.workers -}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ $releaseName }}-cm
  labels:
    # Label values are quoted so that e.g. an all-digit release name is still
    # rendered as a string rather than a YAML integer.
    app: {{ include "distributed-serving.name" $ | quote }}
    chart: {{ include "distributed-serving.chart" $ | quote }}
    release: {{ $releaseName | quote }}
    heritage: {{ .Release.Service | quote }}
    createdBy: "DistributedServing"
data:
  {{- range $replica := until (int .Values.replicas) }}
  hostfile-{{ $replica }}: |-
    {{ $releaseName }}.{{ $releaseName }}-{{ $replica }}.{{ $namespace }}
    {{- range $i := until (int $workerNum) }}
    {{ $releaseName }}.{{ $releaseName }}-{{ $replica }}-{{ $i }}.{{ $namespace }}
    {{- end }}
  {{- end }}
  master.rayInit: |-
    #!/bin/bash
    # Start the Ray head node, wait (up to ray_init_timeout seconds, polling
    # every 5s) until $WORLD_SIZE nodes report alive, then run the command
    # given as $1 and exit with that command's status.
    # Exits 1 if the cluster does not reach full size in time.
    ray_port=6379
    ray_init_timeout=300
    ray_cluster_size=$WORLD_SIZE
    master_command=$1
    ray start --head --port=$ray_port
    for (( i=0; i < $ray_init_timeout; i+=5 )); do
      active_nodes=$(python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))')
      # An empty probe result (e.g. python3 failed) counts as zero nodes so the
      # numeric test below stays well-formed.
      if [ "${active_nodes:-0}" -eq "$ray_cluster_size" ]; then
        echo "All ray workers are active and the ray cluster is initialized successfully."
        # Intentionally unquoted: the command arrives as a single string and
        # relies on word splitting to separate its arguments.
        $master_command
        # Report the serving command's own result instead of masking it.
        exit $?
      fi
      echo "Wait for all ray workers to be active. $active_nodes/$ray_cluster_size is active"
      sleep 5s;
    done
    echo "Waiting for all ray workers to be active timed out."
    exit 1
  worker.rayInit: |-
    #!/bin/bash
    # Join the Ray cluster whose head is at $MASTER_ADDR, retrying every 5s for
    # up to ray_init_timeout seconds, then run the command given as $1 and exit
    # with that command's status.
    # Exits 1 if the worker never manages to join.
    ray_port=6379
    ray_init_timeout=300
    ray_address=$MASTER_ADDR
    worker_command=$1
    for (( i=0; i < $ray_init_timeout; i+=5 )); do
      if ray start --address=$ray_address:$ray_port; then
        echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
        # Intentionally unquoted: the command arrives as a single string and
        # relies on word splitting to separate its arguments.
        $worker_command
        # Report the worker command's own result instead of masking it.
        exit $?
      fi
      echo "Waiting until the ray worker is active..."
      sleep 5s;
    done
    echo "Ray worker starts timeout, head address: $ray_address:$ray_port"
    exit 1
Loading

0 comments on commit 68b71f9

Please sign in to comment.