Skip to content

Commit

Permalink
Add default Intel MPI env variables to MPIJob (#1804)
Browse files Browse the repository at this point in the history
Signed-off-by: Tuomas Katila <[email protected]>
Co-authored-by: Yuki Iwai <[email protected]>
  • Loading branch information
tkatila and tenzen-y committed Jun 19, 2023
1 parent e002b8a commit 7383114
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 0 deletions.
21 changes: 21 additions & 0 deletions pkg/controller.v1/mpi/mpijob.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ const (
initContainerCpu = "100m"
initContainerEphStorage = "5Gi"
initContainerMem = "512Mi"
iMPIDefaultBootstrap = "rsh"
)

const (
Expand Down Expand Up @@ -218,6 +219,26 @@ func isGPULauncher(mpiJob *kubeflowv1.MPIJob) bool {
return false
}

// hasIntelMPIBootstrapValues returns the existence of I_MPI_HYDRA_BOOTSTRAP
// and I_MPI_HYDRA_BOOTSTRAP_EXEC values.
// There are also _EXEC_EXTRA_ARGS and _AUTOFORK under the I_MPI_HYDRA_BOOTSTRAP
// prefix but those are not checked on purpose.
func hasIntelMPIBootstrapValues(envs []corev1.EnvVar) (bootstrap, exec bool) {
for _, env := range envs {
if env.Name == "I_MPI_HYDRA_BOOTSTRAP" {
bootstrap = true
} else if env.Name == "I_MPI_HYDRA_BOOTSTRAP_EXEC" {
exec = true
}

if bootstrap && exec {
break
}
}

return bootstrap, exec
}

func defaultReplicaLabels(genericLabels map[string]string, roleLabelVal string) map[string]string {
replicaLabels := map[string]string{}
for k, v := range genericLabels {
Expand Down
19 changes: 19 additions & 0 deletions pkg/controller.v1/mpi/mpijob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1152,6 +1152,25 @@ func (jc *MPIJobReconciler) newLauncher(mpiJob *kubeflowv1.MPIJob, kubectlDelive
})
}

// Add default Intel MPI bootstrap variables if not provided by the user.
bootstrap, exec := hasIntelMPIBootstrapValues(container.Env)
if !bootstrap {
container.Env = append(container.Env,
corev1.EnvVar{
Name: "I_MPI_HYDRA_BOOTSTRAP",
Value: iMPIDefaultBootstrap,
},
)
}
if !exec {
container.Env = append(container.Env,
corev1.EnvVar{
Name: "I_MPI_HYDRA_BOOTSTRAP_EXEC",
Value: fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName),
},
)
}

container.VolumeMounts = append(container.VolumeMounts,
corev1.VolumeMount{
Name: kubectlVolumeName,
Expand Down
85 changes: 85 additions & 0 deletions pkg/controller.v1/mpi/mpijob_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -709,6 +709,91 @@ var _ = Describe("MPIJob controller", func() {
})
})

Context("Test launcher's Intel MPI handling", func() {
It("Should create a launcher job with Intel MPI env variables", func() {
By("By creating MPIJobs with and without preset env variables")

testCases := map[string]struct {
envVariables map[string]string
expectedEnvVariables map[string]string
}{
"withoutIMPIValues": {
envVariables: map[string]string{
"X_MPI_HYDRA_BOOTSTRAP": "foo",
},
expectedEnvVariables: map[string]string{
"I_MPI_HYDRA_BOOTSTRAP": iMPIDefaultBootstrap,
"I_MPI_HYDRA_BOOTSTRAP_EXEC": fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName),
},
},
"withIMPIBootstrap": {
envVariables: map[string]string{
"I_MPI_HYDRA_BOOTSTRAP": "RSH",
},
expectedEnvVariables: map[string]string{
"I_MPI_HYDRA_BOOTSTRAP": "RSH",
"I_MPI_HYDRA_BOOTSTRAP_EXEC": fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName),
},
},
"withIMPIBootstrapExec": {
envVariables: map[string]string{
"I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh",
},
expectedEnvVariables: map[string]string{
"I_MPI_HYDRA_BOOTSTRAP": iMPIDefaultBootstrap,
"I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh",
},
},
"withIMPIBootstrapAndExec": {
envVariables: map[string]string{
"I_MPI_HYDRA_BOOTSTRAP": "RSH",
"I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh",
},
expectedEnvVariables: map[string]string{
"I_MPI_HYDRA_BOOTSTRAP": "RSH",
"I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh",
},
},
}

for testName, testCase := range testCases {
ctx := context.Background()
startTime := metav1.Now()
completionTime := metav1.Now()

jobName := "test-launcher-creation-" + strings.ToLower(testName)

mpiJob := newMPIJob(jobName, pointer.Int32(1), 1, gpuResourceName, &startTime, &completionTime)
Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

template := &mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template
Expect(len(template.Spec.Containers) == 1).To(BeTrue())

cont := &template.Spec.Containers[0]

for k, v := range testCase.envVariables {
cont.Env = append(cont.Env,
corev1.EnvVar{
Name: k,
Value: v,
},
)
}

launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", false)

Expect(len(launcher.Spec.Containers) == 1).To(BeTrue())
for expectedKey, expectedValue := range testCase.expectedEnvVariables {
Expect(launcher.Spec.Containers[0].Env).Should(ContainElements(
corev1.EnvVar{
Name: expectedKey,
Value: expectedValue,
}),
)
}
}
})
})
})

func ReplicaStatusMatch(replicaStatuses map[common.ReplicaType]*common.ReplicaStatus,
Expand Down

0 comments on commit 7383114

Please sign in to comment.