
Commit a6c2da8

kuizhiqing authored
run worker process in launcher pod (#612)
* run worker in launcher pod; fix DCO issue
* use ptr.Deref
* update manifest
* more Deref
* create one service for both launcher and worker

Signed-off-by: kuizhiqing <[email protected]>
1 parent a1ff84c commit a6c2da8

12 files changed: +207 −79 lines

deploy/v2beta1/mpi-operator.yaml (+5)

@@ -8190,6 +8190,11 @@ spec:
               description: MPIReplicaSpecs contains maps from `MPIReplicaType` to
                 `ReplicaSpec` that specify the MPI replicas to run.
               type: object
+            runLauncherAsWorker:
+              default: false
+              description: RunLauncherAsWorker indicates whether to run worker process
+                in launcher Defaults to false.
+              type: boolean
             runPolicy:
               description: RunPolicy encapsulates various runtime policies of the
                 job.

go.mod (+1 −1)

@@ -15,7 +15,7 @@ require (
 	k8s.io/code-generator v0.27.4
 	k8s.io/klog v1.0.0
 	k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f
-	k8s.io/utils v0.0.0-20230209194617-a36077c30491
+	k8s.io/utils v0.0.0-20240102154912-e7106e64919e
 	sigs.k8s.io/controller-runtime v0.15.1
 	sigs.k8s.io/scheduler-plugins v0.26.7
 	sigs.k8s.io/structured-merge-diff/v4 v4.2.3
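The k8s.io/utils bump is what makes the ptr package, used throughout the controller changes below, available; the previous pin appears to predate it. A minimal standalone sketch of the two helpers this commit adopts, ptr.Deref and ptr.To:

package main

import (
	"fmt"

	"k8s.io/utils/ptr"
)

func main() {
	var enabled *bool // an unset optional field, e.g. runLauncherAsWorker omitted

	// ptr.Deref returns the pointed-to value, or the fallback when nil.
	fmt.Println(ptr.Deref(enabled, false)) // false

	enabled = ptr.To(true) // ptr.To returns a pointer to a copy of its argument
	fmt.Println(ptr.Deref(enabled, false)) // true
}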

go.sum (+2 −2)

@@ -384,8 +384,8 @@ k8s.io/klog/v2 v2.90.1 h1:m4bYOKall2MmOiRaR1J+We67Do7vm9KiQVlT96lnHUw=
 k8s.io/klog/v2 v2.90.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0=
 k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f h1:2kWPakN3i/k81b0gvD5C5FJ2kxm1WrQFanWchyKuqGg=
 k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f/go.mod h1:byini6yhqGC14c3ebc/QwanvYwhuMWF6yz2F8uwW8eg=
-k8s.io/utils v0.0.0-20230209194617-a36077c30491 h1:r0BAOLElQnnFhE/ApUsg3iHdVYYPBjNSSOMowRZxxsY=
-k8s.io/utils v0.0.0-20230209194617-a36077c30491/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
+k8s.io/utils v0.0.0-20240102154912-e7106e64919e h1:eQ/4ljkx21sObifjzXwlPKpdGLrCfRziVtos3ofG/sQ=
+k8s.io/utils v0.0.0-20240102154912-e7106e64919e/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
 sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.1.2 h1:trsWhjU5jZrx6UvFu4WzQDrN7Pga4a7Qg+zcfcj64PA=
 sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.1.2/go.mod h1:+qG7ISXqCDVVcyO8hLn12AKVYYUjM7ftlqsqmrhMZE0=
 sigs.k8s.io/controller-runtime v0.15.1 h1:9UvgKD4ZJGcj24vefUFgZFP3xej/3igL9BsOUTb/+4c=

manifests/base/kubeflow.org_mpijobs.yaml (+5)

@@ -8167,6 +8167,11 @@ spec:
               description: MPIReplicaSpecs contains maps from `MPIReplicaType` to
                 `ReplicaSpec` that specify the MPI replicas to run.
               type: object
+            runLauncherAsWorker:
+              default: false
+              description: RunLauncherAsWorker indicates whether to run worker process
+                in launcher Defaults to false.
+              type: boolean
             runPolicy:
               description: RunPolicy encapsulates various runtime policies of the
                 job.

pkg/apis/kubeflow/v2beta1/swagger.json (+4)

@@ -156,6 +156,10 @@
           "$ref": "#/definitions/v2beta1.ReplicaSpec"
         }
       },
+      "runLauncherAsWorker": {
+        "description": "RunLauncherAsWorker indicates whether to run worker process in launcher Defaults to false.",
+        "type": "boolean"
+      },
       "runPolicy": {
         "description": "RunPolicy encapsulates various runtime policies of the job.",
         "default": {},

pkg/apis/kubeflow/v2beta1/types.go (+6)

@@ -154,6 +154,12 @@ type MPIJobSpec struct {
 	// +kubebuilder:default:=1
 	SlotsPerWorker *int32 `json:"slotsPerWorker,omitempty"`
 
+	// RunLauncherAsWorker indicates whether to run worker process in launcher
+	// Defaults to false.
+	// +optional
+	// +kubebuilder:default:=false
+	RunLauncherAsWorker *bool `json:"runLauncherAsWorker,omitempty"`
+
 	// RunPolicy encapsulates various runtime policies of the job.
 	RunPolicy RunPolicy `json:"runPolicy,omitempty"`
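A sketch of opting in to the new field from the Go API; the import path is assumed from this repo's package layout, and everything besides the two fields shown is elided:

package main

import (
	"fmt"

	kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1"
	"k8s.io/utils/ptr"
)

func main() {
	spec := kubeflow.MPIJobSpec{
		// Run an MPI worker process in the launcher pod as well.
		RunLauncherAsWorker: ptr.To(true),
		SlotsPerWorker:      ptr.To(int32(1)),
	}
	// The controller reads the pointer nil-safely, defaulting to false.
	fmt.Println(ptr.Deref(spec.RunLauncherAsWorker, false)) // true
}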

pkg/apis/kubeflow/v2beta1/zz_generated.deepcopy.go (+5)

(generated file; diff not rendered by default)

pkg/client/applyconfiguration/kubeflow/v2beta1/mpijobspec.go (+9)

(generated file; diff not rendered by default)

pkg/controller/mpi_job_controller.go (+42 −26)

@@ -54,6 +54,7 @@ import (
 	"k8s.io/klog"
 	"k8s.io/utils/clock"
 	"k8s.io/utils/pointer"
+	"k8s.io/utils/ptr"
 	schedclientset "sigs.k8s.io/scheduler-plugins/pkg/generated/clientset/versioned"
 	volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned"
 
@@ -630,7 +631,7 @@ func (c *MPIJobController) syncHandler(key string) error {
 	// We're done if the launcher either succeeded or failed.
 	done := launcher != nil && isJobFinished(launcher)
 	if !done {
-		_, err := c.getOrCreateService(mpiJob, newWorkersService(mpiJob))
+		_, err := c.getOrCreateService(mpiJob, newJobService(mpiJob))
 		if err != nil {
 			return fmt.Errorf("getting or creating Service to front workers: %w", err)
 		}
@@ -656,16 +657,6 @@
 			return err
 		}
 	}
-	if mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationIntel ||
-		mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationMPICH {
-		// The Intel and MPICH implementations require workers to communicate with the
-		// launcher through its hostname. For that, we create a Service which
-		// has the same name as the launcher's hostname.
-		_, err := c.getOrCreateService(mpiJob, newLauncherService(mpiJob))
-		if err != nil {
-			return fmt.Errorf("getting or creating Service to front launcher: %w", err)
-		}
-	}
 	if launcher == nil {
 		if mpiJob.Spec.LauncherCreationPolicy == kubeflow.LauncherCreationPolicyAtStartup || c.countReadyWorkerPods(worker) == len(worker) {
 			launcher, err = c.kubeClient.BatchV1().Jobs(namespace).Create(context.TODO(), c.newLauncherJob(mpiJob), metav1.CreateOptions{})
@@ -1279,17 +1270,30 @@ func (c *MPIJobController) doUpdateJobStatus(mpiJob *kubeflow.MPIJob) error {
 // handleObject can discover the MPIJob resource that 'owns' it.
 func newConfigMap(mpiJob *kubeflow.MPIJob, workerReplicas int32) *corev1.ConfigMap {
 	var buffer bytes.Buffer
-	workersService := mpiJob.Name + workerSuffix
 	slots := 1
 	if mpiJob.Spec.SlotsPerWorker != nil {
 		slots = int(*mpiJob.Spec.SlotsPerWorker)
 	}
+	// note that pod.spec.dnsConfig also affect the svc resolution
+	// ref: https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/
+	// launcher can be reach with hostname or service name
+	if ptr.Deref(mpiJob.Spec.RunLauncherAsWorker, false) {
+		name := mpiJob.Name + launcherSuffix
+		switch mpiJob.Spec.MPIImplementation {
+		case kubeflow.MPIImplementationOpenMPI:
+			buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc slots=%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))
+		case kubeflow.MPIImplementationIntel, kubeflow.MPIImplementationMPICH:
+			buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc:%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))
+		}
+	}
+
 	for i := 0; i < int(workerReplicas); i++ {
+		name := workerName(mpiJob, i)
 		switch mpiJob.Spec.MPIImplementation {
 		case kubeflow.MPIImplementationOpenMPI:
-			buffer.WriteString(fmt.Sprintf("%s%s-%d.%s.%s.svc slots=%d\n", mpiJob.Name, workerSuffix, i, workersService, mpiJob.Namespace, slots))
+			buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc slots=%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))
 		case kubeflow.MPIImplementationIntel, kubeflow.MPIImplementationMPICH:
-			buffer.WriteString(fmt.Sprintf("%s%s-%d.%s.%s.svc:%d\n", mpiJob.Name, workerSuffix, i, workersService, mpiJob.Namespace, slots))
+			buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc:%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))
 		}
 	}
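To see what the new hostfile logic produces, here is a standalone sketch mirroring the OpenMPI branch above for a hypothetical job "pi" in namespace "default" with two workers and runLauncherAsWorker enabled; the -launcher and -worker-<i> name suffixes are assumed from this controller's naming convention:

package main

import (
	"bytes"
	"fmt"
)

func main() {
	jobName, ns, slots, workers := "pi", "default", 1, 2

	var buf bytes.Buffer
	// Launcher entry first: it now resolves through the shared job Service.
	buf.WriteString(fmt.Sprintf("%s-launcher.%s.%s.svc slots=%d\n", jobName, jobName, ns, slots))
	for i := 0; i < workers; i++ {
		buf.WriteString(fmt.Sprintf("%s-worker-%d.%s.%s.svc slots=%d\n", jobName, i, jobName, ns, slots))
	}
	fmt.Print(buf.String())
	// pi-launcher.pi.default.svc slots=1
	// pi-worker-0.pi.default.svc slots=1
	// pi-worker-1.pi.default.svc slots=1
}

Note that the middle component of each hostname is now the job name ("pi"), the shared Service, rather than the old workers Service ("pi-worker").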

@@ -1319,22 +1323,27 @@ func updateDiscoverHostsInConfigMap(configMap *corev1.ConfigMap, mpiJob *kubeflo
 
 	var buffer bytes.Buffer
 	buffer.WriteString("#!/bin/sh\n")
-	workersService := mpiJob.Name + workerSuffix
+
+	// We don't check if launcher is running here, launcher should always be there or the job failed
+	if ptr.Deref(mpiJob.Spec.RunLauncherAsWorker, false) {
+		name := mpiJob.Name + launcherSuffix
+		buffer.WriteString(fmt.Sprintf("echo %s.%s.%s.svc\n", name, mpiJob.Name, mpiJob.Namespace))
+	}
+
 	for _, p := range runningPods {
-		buffer.WriteString(fmt.Sprintf("echo %s.%s.%s.svc\n", p.Name, workersService, p.Namespace))
+		buffer.WriteString(fmt.Sprintf("echo %s.%s.%s.svc\n", p.Name, mpiJob.Name, p.Namespace))
 	}
 
 	configMap.Data[discoverHostsScriptName] = buffer.String()
 }
 
-// newWorkersService creates a new workers' Service for an MPIJob resource.
-func newWorkersService(job *kubeflow.MPIJob) *corev1.Service {
-	return newService(job, job.Name+workerSuffix, defaultLabels(job.Name, worker))
-}
-
-// newLauncherService creates a new launcher's Service for an MPIJob resource.
-func newLauncherService(job *kubeflow.MPIJob) *corev1.Service {
-	return newService(job, job.Name+launcherSuffix, defaultLabels(job.Name, launcher))
+// newJobService creates a Service with the same name of Job for both launcher and worker pods
+func newJobService(job *kubeflow.MPIJob) *corev1.Service {
+	labels := map[string]string{
+		kubeflow.OperatorNameLabel: kubeflow.OperatorName,
+		kubeflow.JobNameLabel:      job.Name,
+	}
+	return newService(job, job.Name, labels)
 }
 
 func newService(job *kubeflow.MPIJob, name string, selector map[string]string) *corev1.Service {
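For reference, a sketch of the single Service that newJobService now requests, assuming newService continues to build a headless Service as it did for the two role-specific Services it replaces; the job name "pi" and namespace are illustrative:

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1"
)

func main() {
	svc := corev1.Service{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "pi", // same name as the MPIJob, shared by launcher and workers
			Namespace: "default",
		},
		Spec: corev1.ServiceSpec{
			ClusterIP: corev1.ClusterIPNone, // headless: DNS resolves pod hostnames directly
			Selector: map[string]string{
				// Selects launcher and worker pods alike, replacing the
				// separate launcher and workers Services.
				kubeflow.OperatorNameLabel: kubeflow.OperatorName,
				kubeflow.JobNameLabel:      "pi",
			},
		},
	}
	fmt.Println(svc.Name, svc.Spec.ClusterIP)
}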
@@ -1416,12 +1425,19 @@ func (c *MPIJobController) newWorker(mpiJob *kubeflow.MPIJob, index int) *corev1
 	}
 	podTemplate.Labels[kubeflow.ReplicaIndexLabel] = strconv.Itoa(index)
 	podTemplate.Spec.Hostname = name
-	podTemplate.Spec.Subdomain = mpiJob.Name + workerSuffix // Matches workers' Service name.
+	podTemplate.Spec.Subdomain = mpiJob.Name // Matches job' Service name.
 	if podTemplate.Spec.HostNetwork {
 		// Allows resolution of worker hostnames without needing to include the
 		// namespace or cluster domain.
 		podTemplate.Spec.DNSPolicy = corev1.DNSClusterFirstWithHostNet
 	}
+	// The Intel and MPICH implementations require workers to communicate with the launcher through its hostname.
+	searche := fmt.Sprintf("%s.%s.svc.cluster.local", mpiJob.Name, mpiJob.Namespace)
+	if podTemplate.Spec.DNSConfig == nil {
+		podTemplate.Spec.DNSConfig = &corev1.PodDNSConfig{Searches: []string{searche}}
+	} else {
+		podTemplate.Spec.DNSConfig.Searches = append(podTemplate.Spec.DNSConfig.Searches, searche)
+	}
 	setRestartPolicy(podTemplate, mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker])
 
 	container := &podTemplate.Spec.Containers[0]
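The appended search domain is what lets Intel and MPICH workers keep reaching the launcher by bare hostname even though the dedicated launcher Service is gone. A toy sketch of the expansion, with the job name and namespace illustrative:

package main

import "fmt"

func main() {
	// Search entry added to each worker pod, per the hunk above.
	search := fmt.Sprintf("%s.%s.svc.cluster.local", "pi", "default")

	// With that entry in resolv.conf, a lookup of the bare hostname
	// "pi-launcher" is retried as the fully qualified in-cluster name:
	fmt.Println("pi-launcher" + "." + search)
	// pi-launcher.pi.default.svc.cluster.local
}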
@@ -1494,7 +1510,7 @@ func (c *MPIJobController) newLauncherPodTemplate(mpiJob *kubeflow.MPIJob) corev
 		c.PodGroupCtrl.decoratePodTemplateSpec(podTemplate, mpiJob.Name)
 	}
 	podTemplate.Spec.Hostname = launcherName
-	podTemplate.Spec.Subdomain = mpiJob.Name + workerSuffix // Matches workers' Service name.
+	podTemplate.Spec.Subdomain = mpiJob.Name // Matches job' Service name.
 	if podTemplate.Spec.HostNetwork {
 		// Allows resolution of worker hostnames without needing to include the
 		// namespace or cluster domain.
