@@ -54,6 +54,7 @@ import (
     "k8s.io/klog"
     "k8s.io/utils/clock"
     "k8s.io/utils/pointer"
+    "k8s.io/utils/ptr"
     schedclientset "sigs.k8s.io/scheduler-plugins/pkg/generated/clientset/versioned"
     volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned"
 
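Note: k8s.io/utils/ptr is the generics-based successor to k8s.io/utils/pointer; the hunks below use ptr.Deref to read the new optional RunLauncherAsWorker field. A minimal sketch of the call (mpiJob here is whatever *kubeflow.MPIJob is in scope):

    // ptr.Deref(p, def) returns *p when p is non-nil, and def otherwise,
    // so an unset RunLauncherAsWorker field is treated as false.
    runLauncherAsWorker := ptr.Deref(mpiJob.Spec.RunLauncherAsWorker, false)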
@@ -630,7 +631,7 @@ func (c *MPIJobController) syncHandler(key string) error {
     // We're done if the launcher either succeeded or failed.
     done := launcher != nil && isJobFinished(launcher)
     if !done {
-        _, err := c.getOrCreateService(mpiJob, newWorkersService(mpiJob))
+        _, err := c.getOrCreateService(mpiJob, newJobService(mpiJob))
         if err != nil {
             return fmt.Errorf("getting or creating Service to front workers: %w", err)
         }
@@ -656,16 +657,6 @@ func (c *MPIJobController) syncHandler(key string) error {
             return err
         }
     }
-    if mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationIntel ||
-        mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationMPICH {
-        // The Intel and MPICH implementations require workers to communicate with the
-        // launcher through its hostname. For that, we create a Service which
-        // has the same name as the launcher's hostname.
-        _, err := c.getOrCreateService(mpiJob, newLauncherService(mpiJob))
-        if err != nil {
-            return fmt.Errorf("getting or creating Service to front launcher: %w", err)
-        }
-    }
     if launcher == nil {
         if mpiJob.Spec.LauncherCreationPolicy == kubeflow.LauncherCreationPolicyAtStartup || c.countReadyWorkerPods(worker) == len(worker) {
             launcher, err = c.kubeClient.BatchV1().Jobs(namespace).Create(context.TODO(), c.newLauncherJob(mpiJob), metav1.CreateOptions{})
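Note: dropping the dedicated launcher Service is safe because the launcher pod now sits behind the single job-wide Service (see newJobService below) and sets its Subdomain to the job name. Assuming a hypothetical MPIJob "pi" in namespace "default" and the usual "-launcher" name suffix, the launcher's address changes from

    pi-launcher.default.svc      (old: dedicated launcher Service)

to

    pi-launcher.pi.default.svc   (new: launcher pod behind the job Service)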
@@ -1279,17 +1270,30 @@ func (c *MPIJobController) doUpdateJobStatus(mpiJob *kubeflow.MPIJob) error {
 // handleObject can discover the MPIJob resource that 'owns' it.
 func newConfigMap(mpiJob *kubeflow.MPIJob, workerReplicas int32) *corev1.ConfigMap {
     var buffer bytes.Buffer
-    workersService := mpiJob.Name + workerSuffix
     slots := 1
     if mpiJob.Spec.SlotsPerWorker != nil {
         slots = int(*mpiJob.Spec.SlotsPerWorker)
     }
+    // Note that pod.spec.dnsConfig also affects svc resolution.
+    // Ref: https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/
+    // The launcher can be reached by its hostname or the service name.
+    if ptr.Deref(mpiJob.Spec.RunLauncherAsWorker, false) {
+        name := mpiJob.Name + launcherSuffix
+        switch mpiJob.Spec.MPIImplementation {
+        case kubeflow.MPIImplementationOpenMPI:
+            buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc slots=%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))
+        case kubeflow.MPIImplementationIntel, kubeflow.MPIImplementationMPICH:
+            buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc:%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))
+        }
+    }
+
     for i := 0; i < int(workerReplicas); i++ {
+        name := workerName(mpiJob, i)
         switch mpiJob.Spec.MPIImplementation {
         case kubeflow.MPIImplementationOpenMPI:
-            buffer.WriteString(fmt.Sprintf("%s%s-%d.%s.%s.svc slots=%d\n", mpiJob.Name, workerSuffix, i, workersService, mpiJob.Namespace, slots))
+            buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc slots=%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))
         case kubeflow.MPIImplementationIntel, kubeflow.MPIImplementationMPICH:
-            buffer.WriteString(fmt.Sprintf("%s%s-%d.%s.%s.svc:%d\n", mpiJob.Name, workerSuffix, i, workersService, mpiJob.Namespace, slots))
+            buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc:%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))
         }
     }
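For illustration, assuming a hypothetical OpenMPI MPIJob named "pi" in namespace "default" with RunLauncherAsWorker enabled, two workers, one slot per worker, and the usual "-launcher"/"-worker" name suffixes, the hostfile produced by newConfigMap would read:

    pi-launcher.pi.default.svc slots=1
    pi-worker-0.pi.default.svc slots=1
    pi-worker-1.pi.default.svc slots=1

For Intel and MPICH the slot count is appended as ":1" instead of " slots=1".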
@@ -1319,22 +1323,27 @@ func updateDiscoverHostsInConfigMap(configMap *corev1.ConfigMap, mpiJob *kubeflo
 
     var buffer bytes.Buffer
     buffer.WriteString("#!/bin/sh\n")
-    workersService := mpiJob.Name + workerSuffix
+
+    // We don't check whether the launcher is running here; the launcher should always exist, or the job has failed.
+    if ptr.Deref(mpiJob.Spec.RunLauncherAsWorker, false) {
+        name := mpiJob.Name + launcherSuffix
+        buffer.WriteString(fmt.Sprintf("echo %s.%s.%s.svc\n", name, mpiJob.Name, mpiJob.Namespace))
+    }
+
     for _, p := range runningPods {
-        buffer.WriteString(fmt.Sprintf("echo %s.%s.%s.svc\n", p.Name, workersService, p.Namespace))
+        buffer.WriteString(fmt.Sprintf("echo %s.%s.%s.svc\n", p.Name, mpiJob.Name, p.Namespace))
     }
 
     configMap.Data[discoverHostsScriptName] = buffer.String()
 }
 
-// newWorkersService creates a new workers' Service for an MPIJob resource.
-func newWorkersService(job *kubeflow.MPIJob) *corev1.Service {
-    return newService(job, job.Name+workerSuffix, defaultLabels(job.Name, worker))
-}
-
-// newLauncherService creates a new launcher's Service for an MPIJob resource.
-func newLauncherService(job *kubeflow.MPIJob) *corev1.Service {
-    return newService(job, job.Name+launcherSuffix, defaultLabels(job.Name, launcher))
+// newJobService creates a Service with the same name as the Job, fronting both launcher and worker pods.
+func newJobService(job *kubeflow.MPIJob) *corev1.Service {
+    labels := map[string]string{
+        kubeflow.OperatorNameLabel: kubeflow.OperatorName,
+        kubeflow.JobNameLabel:      job.Name,
+    }
+    return newService(job, job.Name, labels)
 }
 
 func newService(job *kubeflow.MPIJob, name string, selector map[string]string) *corev1.Service {
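Under the same hypothetical "pi" job, with the launcher running as a worker and two worker pods running, the generated discover_hosts.sh script would print:

    #!/bin/sh
    echo pi-launcher.pi.default.svc
    echo pi-worker-0.pi.default.svc
    echo pi-worker-1.pi.default.svc

Note also that newJobService selects pods by the operator and job-name labels rather than per-role labels, which is what lets a single Service front launcher and workers alike.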
@@ -1416,12 +1425,19 @@ func (c *MPIJobController) newWorker(mpiJob *kubeflow.MPIJob, index int) *corev1
     }
     podTemplate.Labels[kubeflow.ReplicaIndexLabel] = strconv.Itoa(index)
     podTemplate.Spec.Hostname = name
-    podTemplate.Spec.Subdomain = mpiJob.Name + workerSuffix // Matches workers' Service name.
+    podTemplate.Spec.Subdomain = mpiJob.Name // Matches the job's Service name.
     if podTemplate.Spec.HostNetwork {
         // Allows resolution of worker hostnames without needing to include the
         // namespace or cluster domain.
         podTemplate.Spec.DNSPolicy = corev1.DNSClusterFirstWithHostNet
     }
+    // The Intel and MPICH implementations require workers to communicate with the launcher through its hostname.
+    search := fmt.Sprintf("%s.%s.svc.cluster.local", mpiJob.Name, mpiJob.Namespace)
+    if podTemplate.Spec.DNSConfig == nil {
+        podTemplate.Spec.DNSConfig = &corev1.PodDNSConfig{Searches: []string{search}}
+    } else {
+        podTemplate.Spec.DNSConfig.Searches = append(podTemplate.Spec.DNSConfig.Searches, search)
+    }
     setRestartPolicy(podTemplate, mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker])
 
     container := &podTemplate.Spec.Containers[0]
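The extra search domain makes short names resolve through the job Service. Assuming the default "cluster.local" cluster domain, a worker pod of the hypothetical "pi" job would end up with a resolv.conf search list along the lines of:

    search pi.default.svc.cluster.local default.svc.cluster.local svc.cluster.local cluster.local

so the bare hostname "pi-launcher" expands to "pi-launcher.pi.default.svc.cluster.local", the launcher's DNS record behind the job Service.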
@@ -1494,7 +1510,7 @@ func (c *MPIJobController) newLauncherPodTemplate(mpiJob *kubeflow.MPIJob) corev
         c.PodGroupCtrl.decoratePodTemplateSpec(podTemplate, mpiJob.Name)
     }
     podTemplate.Spec.Hostname = launcherName
-    podTemplate.Spec.Subdomain = mpiJob.Name + workerSuffix // Matches workers' Service name.
+    podTemplate.Spec.Subdomain = mpiJob.Name // Matches the job's Service name.
     if podTemplate.Spec.HostNetwork {
         // Allows resolution of worker hostnames without needing to include the
         // namespace or cluster domain.