Skip to content

Commit

Permalink
Rename PodGroupPolicy and MLPolicy APIs
Browse files Browse the repository at this point in the history
Signed-off-by: Andrey Velichkevich <[email protected]>
  • Loading branch information
andreyvelich committed Aug 26, 2024
1 parent c28a166 commit 06e7653
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 60 deletions.
60 changes: 33 additions & 27 deletions pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,10 @@ type TrainingRuntimeList struct {
type TrainingRuntimeSpec struct {

// Configuration for the model training with ML-specific parameters.
MLSpec *MLSpec `json:"mlSpec,omitempty"`
MLPolicy *MLPolicy `json:"mlPolicy,omitempty"`

// Configuration for the PodGroup to enable gang-scheduling via supported plugins.
PodGroupSpec *PodGroupSpec `json:"podGroupSpec,omitempty"`
PodGroupPolicy *PodGroupPolicy `json:"podGroupPolicy,omitempty"`

// JobSet template which will be used by TrainJob.
Template JobSetTemplateSpec `json:"template"`
Expand All @@ -101,51 +101,57 @@ type JobSetTemplateSpec struct {
Spec jobsetv1alpha2.JobSetSpec `json:"spec,omitempty"`
}

// PodGroupSpec represents a PodGroup configuration to enable gang-scheduling.
type PodGroupSpec struct {
// Plugin for the gang-scheduling.
Plugin GangSchedulerPlugin `json:"plugin"`
// PodGroupPolicy represents a PodGroup configuration for gang-scheduling.
type PodGroupPolicy struct {

// Time threshold to schedule PodGroup for gang-scheduling.
ScheduleTimeoutSeconds *string `json:"scheduleTimeoutSeconds,omitempty"`
// Configuration for gang-scheduling using various plugins.
PodGroupPolicySource `json:",inline"`
}

// GangSchedulerPlugin represents one of the supported gang-scheduling plugins.
type GangSchedulerPlugin string

const (
// Volcano plugin for gang-scheduling.
GangSchedulerPluginVolcano GangSchedulerPlugin = "volcano"
// PodGroupPolicySource represents supported plugins for gang-scheduling.
// Only one of its members may be specified.
type PodGroupPolicySource struct {

// Coscheduling plugin from the Kubernetes scheduler-plugins for gang-scheduling.
GangSchedulerPluginCoscheduling GangSchedulerPlugin = "coscheduling"
)
Coscheduling *CoschedulingPodGroupPolicySource `json:"coscheduling,omitempty"`

// TODO (andreyvelich): Add support for Volcano gang-scheduler.
}

// CoschedulingPodGroupPolicySource represents configuration for the coscheduling plugin
// used for gang-scheduling.
type CoschedulingPodGroupPolicySource struct {

// ScheduleTimeoutSeconds is the time threshold, in seconds, to schedule PodGroup
// for gang-scheduling.
// If the scheduling timeout is equal to 0, the default value is used.
// Defaults to 60 seconds.
// The field is optional: it is a pointer and is omitted from the JSON when nil.
ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"`
}

// MLSpec represents configuration for the model training with ML-specific parameters.
type MLSpec struct {
// MLPolicy represents configuration for the model training with ML-specific parameters.
type MLPolicy struct {

// Number of training nodes.
// Defaults to 1.
NumNodes *int32 `json:"numNodes,omitempty"`

// Configuration for the runtime-specific parameters, such as Torch or MPI.
// One of the following spec sources can be set.
MLSpecSource `json:",inline"`
// Only one of its members may be specified.
MLPolicySource `json:",inline"`
}

// MLPolicySource represents the runtime-specific configuration for various technologies.
// One of the following specs can be set.
type MLSpecSource struct {
type MLPolicySource struct {

// Configuration for the PyTorch runtime.
Torch *TorchMLSpecSource `json:"torch,omitempty"`
Torch *TorchMLPolicySource `json:"torch,omitempty"`

// Configuration for the MPI Runtime.
MPI *MPIMLSpecSource `json:"mpi,omitempty"`
MPI *MPIMLPolicySource `json:"mpi,omitempty"`
}

// TorchMLSpecSource represents a PyTorch runtime configuration.
type TorchMLSpecSource struct {
// TorchMLPolicySource represents a PyTorch runtime configuration.
type TorchMLPolicySource struct {
// Number of processes per node.
// This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
// Supported values: `auto`, `cpu`, `gpu`, or int value.
Expand Down Expand Up @@ -179,8 +185,8 @@ type TorchElasticPolicy struct {
Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"`
}

// MPIMLSpecSource represents an MPI runtime configuration.
type MPIMLSpecSource struct {
// MPIMLPolicySource represents an MPI runtime configuration.
type MPIMLPolicySource struct {
// Number of processes per node.
// This value is equal to the number of slots for each node in the hostfile.
NumProcPerNode *int32 `json:"numProcPerNode,omitempty"`
Expand Down
102 changes: 69 additions & 33 deletions pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 06e7653

Please sign in to comment.