-
Notifications
You must be signed in to change notification settings - Fork 698
[RayJob] Wait for workers before submitting jobs in sidecar mode #4429
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -81,6 +81,30 @@ func GetMetadataJson(metadata map[string]string, rayVersion string) (string, err | |
| return pkgutils.ConvertByteSliceToString(metadataBytes), nil | ||
| } | ||
|
|
||
| // GetMinReplicasFromSpec calculates the minimum expected worker replicas from the RayClusterSpec. | ||
| // This is used in SidecarMode to determine how many workers should be registered before submitting the job. | ||
| func GetMinReplicasFromSpec(rayClusterSpec *rayv1.RayClusterSpec) int32 { | ||
| if rayClusterSpec == nil { | ||
| return 0 | ||
| } | ||
| count := int32(0) | ||
| for _, nodeGroup := range rayClusterSpec.WorkerGroupSpecs { | ||
| if nodeGroup.Suspend != nil && *nodeGroup.Suspend { | ||
| continue | ||
| } | ||
| minReplicas := int32(0) | ||
| if nodeGroup.MinReplicas != nil && *nodeGroup.MinReplicas > 0 { | ||
| minReplicas = *nodeGroup.MinReplicas | ||
| } else if nodeGroup.Replicas != nil && *nodeGroup.Replicas > 0 { | ||
| // Fall back to Replicas when MinReplicas is not set or is 0. | ||
| // This handles static clusters where users only set Replicas. | ||
| minReplicas = *nodeGroup.Replicas | ||
| } | ||
| count += minReplicas * nodeGroup.NumOfHosts | ||
| } | ||
| return count | ||
| } | ||
|
|
||
| // BuildJobSubmitCommand builds the `ray job submit` command based on submission mode. | ||
| func BuildJobSubmitCommand(rayJobInstance *rayv1.RayJob, submissionMode rayv1.JobSubmissionMode) ([]string, error) { | ||
| var address string | ||
|
|
@@ -139,6 +163,33 @@ func BuildJobSubmitCommand(rayJobInstance *rayv1.RayJob, submissionMode rayv1.Jo | |
| "do", "echo", strconv.Quote("Waiting for Ray Dashboard GCS to become healthy at " + address + " ..."), ";", "sleep", "2", ";", "done", ";", | ||
| } | ||
| cmd = append(cmd, waitLoop...) | ||
|
|
||
| // Wait for the expected number of worker nodes to register for the Ray cluster. | ||
| // RAY_EXPECTED_MIN_WORKERS is set by the controller based on the MinReplicas in the RayClusterSpec. | ||
| // The loop queries the Ray Dashboard API to get the number of alive nodes and | ||
| // continues until the number of alive nodes is equal to (expected_workers + 1) for head node. | ||
| // This ensures that worker pods are connected before the job is submitted otherwise | ||
| // the jobs may run on the Head node. | ||
| // | ||
| // Note: This loop will never timeout and will wait indefinitely if workers never register. | ||
| // This can be mitigated by setting the RayJob's `activeDeadlineSeconds` field | ||
| // to enforce a maximum job execution time. | ||
| // | ||
| // The wget command includes the x-ray-authorization header if RAY_AUTH_TOKEN is set. | ||
| // This is required when Ray auth token mode is enabled, otherwise the request will fail with 401. | ||
| wgetAuthHeader := "${" + utils.RAY_AUTH_TOKEN_ENV_VAR + ":+--header \"x-ray-authorization: Bearer $" + utils.RAY_AUTH_TOKEN_ENV_VAR + "\"}" | ||
| waitForNodesLoop := []string{ | ||
| "if", "[", "-n", "\"$" + utils.RAY_EXPECTED_MIN_WORKERS + "\"", "]", "&&", "[", "\"$" + utils.RAY_EXPECTED_MIN_WORKERS + "\"", "-gt", "\"0\"", "]", ";", "then", | ||
| "EXPECTED_NODES=$(($" + utils.RAY_EXPECTED_MIN_WORKERS + " + 1))", ";", | ||
| "echo", strconv.Quote("Waiting for $EXPECTED_NODES nodes (1 head + $" + utils.RAY_EXPECTED_MIN_WORKERS + " workers) to register..."), ";", | ||
| "until", "[", | ||
| "\"$(wget " + wgetAuthHeader + " -q -O- " + address + "/nodes?view=summary 2>/dev/null | python3 -c \"import sys,json; d=json.load(sys.stdin); print(len([n for n in d.get('data',{}).get('summary',[]) if n.get('raylet',{}).get('state','')=='ALIVE']))\" 2>/dev/null || echo 0)\"", | ||
| "-ge", "\"$EXPECTED_NODES\"", "]", ";", | ||
| "do", "echo", strconv.Quote("Waiting for Ray nodes to register. Expected: $EXPECTED_NODES ..."), ";", "sleep", "2", ";", "done", ";", | ||
| "echo", strconv.Quote("All expected nodes are registered."), ";", | ||
| "fi", ";", | ||
| } | ||
| cmd = append(cmd, waitForNodesLoop...) | ||
win5923 marked this conversation as resolved.
Show resolved
Hide resolved
Comment on lines
143
to
174
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we use something like
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @Future-Outlier - I pushed a commit that uses Python's urllib instead of wget. |
||
| } | ||
|
|
||
| // In Sidecar mode, we only support RayJob level retry, which means that the submitter retry won't happen, | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -613,6 +613,13 @@ func configureSubmitterContainer(container *corev1.Container, rayJobInstance *ra | |||||||||||||||||||
| // ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID ... | ||||||||||||||||||||
| container.Env = append(container.Env, corev1.EnvVar{Name: utils.RAY_DASHBOARD_ADDRESS, Value: rayJobInstance.Status.DashboardURL}) | ||||||||||||||||||||
| container.Env = append(container.Env, corev1.EnvVar{Name: utils.RAY_JOB_SUBMISSION_ID, Value: rayJobInstance.Status.JobId}) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| // In SidecarMode, pass the expected minimum worker count so the submitter can wait for workers to register | ||||||||||||||||||||
| if submissionMode == rayv1.SidecarMode && rayJobInstance.Spec.RayClusterSpec != nil { | ||||||||||||||||||||
| minWorkers := common.GetMinReplicasFromSpec(rayJobInstance.Spec.RayClusterSpec) | ||||||||||||||||||||
|
||||||||||||||||||||
| // CalculateDesiredReplicas calculate desired worker replicas at the cluster level | |
| func CalculateDesiredReplicas(ctx context.Context, cluster *rayv1.RayCluster) int32 { | |
| count := int32(0) | |
| for _, nodeGroup := range cluster.Spec.WorkerGroupSpecs { | |
| count += GetWorkerGroupDesiredReplicas(ctx, nodeGroup) | |
| } | |
| return count | |
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
RayJob in Kubernetes Job mode waits for the RayCluster to reach the Ready state, and the RayCluster’s Ready state requires all pods to be running, including both the head pod and the worker pods. The number of worker pods is determined by CalculateDesiredReplicas.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we use the function CalculateDesiredReplicas?
kuberay/ray-operator/controllers/ray/utils/util.go
Lines 410 to 418 in 79b5c30