Improve RayJob InTreeAutoscaling e2e test #8174
base: main
Changes from 9 commits
@@ -17,12 +17,15 @@ limitations under the License.
package e2e

import (
    "strings"

    "github.com/onsi/ginkgo/v2"
    "github.com/onsi/gomega"
    rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
    corev1 "k8s.io/api/core/v1"
    apimeta "k8s.io/apimachinery/pkg/api/meta"
    "k8s.io/apimachinery/pkg/api/resource"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/types"
    "sigs.k8s.io/controller-runtime/pkg/client"
@@ -50,6 +53,17 @@ var _ = ginkgo.Describe("Kuberay", func() {
        lq *kueue.LocalQueue
    )

    // countRunningWorkerPods counts the number of running pods that have "workers" in their name
    countRunningWorkerPods := func(podList *corev1.PodList) int {
        workerPodCount := 0
        for _, pod := range podList.Items {
            if strings.Contains(pod.Name, "workers") && pod.Status.Phase == corev1.PodRunning {
                workerPodCount++
            }
        }
        return workerPodCount
    }

    ginkgo.BeforeEach(func() {
        ns = util.CreateNamespaceFromPrefixWithLog(ctx, k8sClient, "kuberay-e2e-")
        rf = utiltestingapi.MakeResourceFlavor(resourceFlavorName).
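countRunningWorkerPods identifies worker pods by the "workers" substring that KubeRay puts into their names. A minimal alternative sketch, assuming KubeRay also labels worker pods with ray.io/node-type=worker (worth confirming for the KubeRay version used in CI; the helper name below is illustrative and not part of this change), would filter by label at list time instead:

// Sketch only: count running Ray worker pods via the (assumed) KubeRay label
// ray.io/node-type=worker rather than a name substring.
countRunningWorkerPodsByLabel := func(g gomega.Gomega) int {
    podList := &corev1.PodList{}
    g.Expect(k8sClient.List(ctx, podList,
        client.InNamespace(ns.Name),
        client.MatchingLabels{"ray.io/node-type": "worker"},
    )).To(gomega.Succeed())
    count := 0
    for _, pod := range podList.Items {
        if pod.Status.Phase == corev1.PodRunning {
            count++
        }
    }
    return count
}

Selecting by label keeps the count independent of pod naming conventions, at the cost of relying on KubeRay's labeling.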
@@ -60,7 +74,7 @@ var _ = ginkgo.Describe("Kuberay", func() {
    cq = utiltestingapi.MakeClusterQueue(clusterQueueName).
        ResourceGroup(
            *utiltestingapi.MakeFlavorQuotas(rf.Name).
-               Resource(corev1.ResourceCPU, "1").Obj()).
+               Resource(corev1.ResourceCPU, "3").Obj()).
        Obj()
    util.CreateClusterQueuesAndWaitForActive(ctx, k8sClient, cq)
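The quota bump from 1 to 3 CPUs appears sized for the autoscaling peak exercised below: assuming 200m of CPU per pod and a maximum of 5 workers, the test needs roughly 200m × (1 head + 5 workers + 1 submitter) = 1400m (1.4 CPUs) at once, which would not fit under the previous 1-CPU quota but leaves headroom under 3 CPUs.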
@@ -140,19 +154,46 @@ var _ = ginkgo.Describe("Kuberay", func() {
        })
    })

    // TODO enhance to test scale up / down operations
    // See https://github.com/kubernetes-sigs/kueue/pull/8082#discussion_r2605582024
    ginkgo.It("Should run a rayjob with InTreeAutoscaling", func() {
        kuberayTestImage := util.GetKuberayTestImage()

        // Create ConfigMap with Python script
        configMap := &corev1.ConfigMap{
            ObjectMeta: metav1.ObjectMeta{
                Name:      "rayjob-autoscaling",
                Namespace: ns.Name,
            },
            Data: map[string]string{
                "sample_code.py": `import ray
import os

ray.init()

@ray.remote
def my_task(x, s):
    import time
    time.sleep(s)
    return x * x

# run tasks in sequence to avoid triggering autoscaling in the beginning
print([ray.get(my_task.remote(i, 1)) for i in range(4)])

# run tasks in parallel to trigger autoscaling (scaling up)
print(ray.get([my_task.remote(i, 4) for i in range(16)]))

# run tasks in sequence to trigger scaling down
print([ray.get(my_task.remote(i, 1)) for i in range(16)])`,
            },
        }

        rayJob := testingrayjob.MakeJob("rayjob-autoscaling", ns.Name).
            Queue(localQueueName).
            Annotation(workloadslicing.EnabledAnnotationKey, workloadslicing.EnabledAnnotationValue).
            EnableInTreeAutoscaling().
            WithSubmissionMode(rayv1.K8sJobMode).
-           Entrypoint("python -c \"import ray; ray.init(); print(ray.cluster_resources())\"").
-           RequestAndLimit(rayv1.HeadNode, corev1.ResourceCPU, "300m").
-           RequestAndLimit(rayv1.WorkerNode, corev1.ResourceCPU, "300m").
+           Entrypoint("python /home/ray/samples/sample_code.py").
+           RequestAndLimit(rayv1.HeadNode, corev1.ResourceCPU, "200m").
+           RequestAndLimit(rayv1.WorkerNode, corev1.ResourceCPU, "200m").
            WithSubmitterPodTemplate(corev1.PodTemplateSpec{
                Spec: corev1.PodSpec{
                    Containers: []corev1.Container{
@@ -161,10 +202,10 @@ var _ = ginkgo.Describe("Kuberay", func() {
                            Image: kuberayTestImage,
                            Resources: corev1.ResourceRequirements{
                                Requests: corev1.ResourceList{
-                                   corev1.ResourceCPU: resource.MustParse("300m"),
+                                   corev1.ResourceCPU: resource.MustParse("200m"),
                                },
                                Limits: corev1.ResourceList{
-                                   corev1.ResourceCPU: resource.MustParse("300m"),
+                                   corev1.ResourceCPU: resource.MustParse("200m"),
                                },
                            },
                        },
@@ -175,19 +216,42 @@ var _ = ginkgo.Describe("Kuberay", func() {
            Image(rayv1.HeadNode, kuberayTestImage).
            Image(rayv1.WorkerNode, kuberayTestImage).Obj()

        // Add volume and volumeMount to head node for the ConfigMap
        rayJob.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.Volumes = []corev1.Volume{
            {
                Name: "script-volume",
                VolumeSource: corev1.VolumeSource{
                    ConfigMap: &corev1.ConfigMapVolumeSource{
                        LocalObjectReference: corev1.LocalObjectReference{
                            Name: "rayjob-autoscaling",
                        },
                    },
                },
            },
        }
        rayJob.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.Containers[0].VolumeMounts = []corev1.VolumeMount{
            {
                Name:      "script-volume",
                MountPath: "/home/ray/samples",
            },
        }

        ginkgo.By("Creating the ConfigMap", func() {
            gomega.Expect(k8sClient.Create(ctx, configMap)).Should(gomega.Succeed())
        })

        ginkgo.By("Creating the rayJob", func() {
            gomega.Expect(k8sClient.Create(ctx, rayJob)).Should(gomega.Succeed())
        })

-       ginkgo.By("Checking one workload is created and admitted", func() {
+       ginkgo.By("Checking one workload is created", func() {
            gomega.Eventually(func(g gomega.Gomega) {
                workloadList := &kueue.WorkloadList{}
                g.Expect(k8sClient.List(ctx, workloadList, client.InNamespace(ns.Name))).To(gomega.Succeed())
                g.Expect(workloadList.Items).NotTo(gomega.BeEmpty(), "Expected at least one workload in namespace")
                hasAdmittedWorkload := false
                for _, wl := range workloadList.Items {
-                   if apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadAdmitted) ||
-                       apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadFinished) {
+                   if apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadAdmitted) {
                        hasAdmittedWorkload = true
                        break
                    }
@@ -205,6 +269,56 @@ var _ = ginkgo.Describe("Kuberay", func() {
            }, util.VeryLongTimeout, util.Interval).Should(gomega.Succeed())
        })

        ginkgo.By("Waiting for 3 pods in rayjob namespace", func() {
            // 3 rayjob pods: head, worker, submitter job
            gomega.Eventually(func(g gomega.Gomega) {
                podList := &corev1.PodList{}
                g.Expect(k8sClient.List(ctx, podList, client.InNamespace(ns.Name))).To(gomega.Succeed())
                g.Expect(podList.Items).To(gomega.HaveLen(3), "Expected exactly 3 pods in rayjob namespace")
                // Count pods that have "workers" in their name
                workerPodCount := countRunningWorkerPods(podList)
                g.Expect(workerPodCount).To(gomega.Equal(1), "Expected exactly 1 pod with 'workers' in the name")
            }, util.VeryLongTimeout, util.Interval).Should(gomega.Succeed())
        })
| ginkgo.By("Waiting for 2 workloads", func() { | ||
| // 2 workloads: one for the ray job, another for the submitter job created by the ray job | ||
            gomega.Eventually(func(g gomega.Gomega) {
                workloadList := &kueue.WorkloadList{}
                g.Expect(k8sClient.List(ctx, workloadList, client.InNamespace(ns.Name))).To(gomega.Succeed())
                g.Expect(workloadList.Items).To(gomega.HaveLen(2), "Expected exactly 2 workloads")
            }, util.VeryLongTimeout, util.Interval).Should(gomega.Succeed())
        })
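The two expected workloads come from the RayJob itself and from the submitter Job it creates in K8sJobMode. When a count assertion like this fails, it is hard to tell which workload is missing or extra; a diagnostic sketch (not part of this change) that could be dropped into the Eventually block above prints each workload together with its owners:

// Sketch only: log each workload and its owners to make count mismatches easier to debug.
// Owner kinds are printed rather than asserted, since they depend on KubeRay and
// workload-slicing internals.
for _, wl := range workloadList.Items {
    for _, owner := range wl.OwnerReferences {
        ginkgo.GinkgoWriter.Printf("workload %s owned by %s/%s\n", wl.Name, owner.Kind, owner.Name)
    }
}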
| ginkgo.By("Waiting for 5 workers due to scaling up", func() { | ||
| gomega.Eventually(func(g gomega.Gomega) { | ||
| podList := &corev1.PodList{} | ||
| g.Expect(k8sClient.List(ctx, podList, client.InNamespace(ns.Name))).To(gomega.Succeed()) | ||
| // Count pods that have "workers" in their name | ||
| workerPodCount := countRunningWorkerPods(podList) | ||
| g.Expect(workerPodCount).To(gomega.Equal(5), "Expected exactly 5 pods with 'workers' in the name") | ||
|
||
| }, util.VeryLongTimeout, util.Interval).Should(gomega.Succeed()) | ||
| }) | ||
|
|
||
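Counting Running pods confirms that scale-up really happened, but it also couples the assertion to how quickly the new pods get scheduled. A complementary sketch, assuming the RayJob status exposes the generated cluster through Status.RayClusterName and that the worker group's desired size is reflected in Spec.WorkerGroupSpecs[0].Replicas (both worth confirming against the KubeRay API version in use), would check the autoscaler's intent directly:

// Sketch only: assert the desired worker replica count on the generated RayCluster,
// independent of how quickly the new worker pods become Running.
gomega.Eventually(func(g gomega.Gomega) {
    createdRayJob := &rayv1.RayJob{}
    g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: rayJob.Name, Namespace: ns.Name}, createdRayJob)).To(gomega.Succeed())
    rayCluster := &rayv1.RayCluster{}
    g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: createdRayJob.Status.RayClusterName, Namespace: ns.Name}, rayCluster)).To(gomega.Succeed())
    g.Expect(rayCluster.Spec.WorkerGroupSpecs).NotTo(gomega.BeEmpty())
    g.Expect(rayCluster.Spec.WorkerGroupSpecs[0].Replicas).To(gomega.HaveValue(gomega.Equal(int32(5))))
}, util.VeryLongTimeout, util.Interval).Should(gomega.Succeed())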
| ginkgo.By("Waiting for 3 workloads due to scaling up creating another workload", func() { | ||
| // 3 workloads now, after scaling up, a new workload will be created for the new resource request | ||
| gomega.Eventually(func(g gomega.Gomega) { | ||
| workloadList := &kueue.WorkloadList{} | ||
| g.Expect(k8sClient.List(ctx, workloadList, client.InNamespace(ns.Name))).To(gomega.Succeed()) | ||
| g.Expect(workloadList.Items).To(gomega.HaveLen(3), "Expected exactly 3 workloads") | ||
| }, util.VeryLongTimeout, util.Interval).Should(gomega.Succeed()) | ||
| }) | ||
|
|
||
| ginkgo.By("Waiting for workers reduced to 1 due to scaling down", func() { | ||
| gomega.Eventually(func(g gomega.Gomega) { | ||
| podList := &corev1.PodList{} | ||
| g.Expect(k8sClient.List(ctx, podList, client.InNamespace(ns.Name))).To(gomega.Succeed()) | ||
| // Count pods that have "workers" in their name | ||
| workerPodCount := countRunningWorkerPods(podList) | ||
| g.Expect(workerPodCount).To(gomega.Equal(1), "Expected exactly 1 pods with 'workers' in the name") | ||
| }, util.VeryLongTimeout, util.Interval).Should(gomega.Succeed()) | ||
| }) | ||
|
|
||
| ginkgo.By("Waiting for the RayJob to finish", func() { | ||
| gomega.Eventually(func(g gomega.Gomega) { | ||
| createdRayJob := &rayv1.RayJob{} | ||
|
|
||