RHOAIENG-60624: fix nvidia GPU tests

pawelpaszki · sutaakar · commit 3d7cf81b5f72 · 2026-05-04T10:59:01.000+02:00
diff --git a/tests/odh/mnist_ray_test.go b/tests/odh/mnist_ray_test.go
@@ -65,7 +65,7 @@ func mnistRay(t *testing.T, numGpus int, gpuResourceName string, rayImage string
 	test := With(t)
 
 	// Create a namespace
-	namespace := test.NewTestNamespace()
+	namespace := test.NewTestNamespace(WithKueueManaged())
 
 	// Ensure Notebook ServiceAccount exists (no extra RBAC)
 	ensureNotebookServiceAccount(test, namespace.Name)
diff --git a/tests/odh/mnist_raytune_hpo_test.go b/tests/odh/mnist_raytune_hpo_test.go
@@ -45,7 +45,7 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
 	test := With(t)
 
 	// Creating a namespace
-	namespace := test.NewTestNamespace()
+	namespace := test.NewTestNamespace(WithKueueManaged())
 
 	// Ensure Notebook ServiceAccount exists (no extra RBAC)
 	ensureNotebookServiceAccount(test, namespace.Name)
@@ -72,7 +72,7 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
 							},
 							{
 								Name:         corev1.ResourceName("nvidia.com/gpu"),
-								NominalQuota: resource.MustParse(fmt.Sprint(numGpus)),
+								NominalQuota: resource.MustParse(fmt.Sprint(numGpus * 2)), // 2 workers x numGpus each
 							},
 						},
 					},
diff --git a/tests/odh/ray_finetune_llm_deepspeed_test.go b/tests/odh/ray_finetune_llm_deepspeed_test.go
@@ -44,7 +44,7 @@ func rayFinetuneLlmDeepspeed(t *testing.T, numGpus int, modelName string, modelC
 	test := With(t)
 
 	// Create a namespace
-	namespace := test.NewTestNamespace()
+	namespace := test.NewTestNamespace(WithKueueManaged())
 	var workingDirectory, err = os.Getwd()
 	test.Expect(err).ToNot(HaveOccurred())
 
diff --git a/tests/odh/raytune_oai_mr_grpc_test.go b/tests/odh/raytune_oai_mr_grpc_test.go
@@ -46,7 +46,7 @@ func raytuneHpo(t *testing.T, numGpus int) {
 	test := With(t)
 
 	// Create a namespace
-	namespace := test.NewTestNamespace()
+	namespace := test.NewTestNamespace(WithKueueManaged())
 
 	// Ensure Notebook ServiceAccount exists (no extra RBAC)
 	ensureNotebookServiceAccount(test, namespace.Name)
diff --git a/tests/odh/resources/mnist_hpo.py b/tests/odh/resources/mnist_hpo.py
@@ -171,7 +171,7 @@ def train_mnist(config):
         model.parameters(), lr=config["lr"], momentum=config["momentum"]
     )
 
-    while True:
+    for _ in range(5):
         train_func(model, optimizer, train_loader, device)
         acc = test_func(model, test_loader, device)
         metrics = {"mean_accuracy": acc}
@@ -180,12 +180,33 @@ def train_mnist(config):
         if should_checkpoint:
             with tempfile.TemporaryDirectory() as tempdir:
                 torch.save(model.state_dict(), os.path.join(tempdir, "model.pt"))
-                train.report(metrics, checkpoint=Checkpoint.from_directory(tempdir))
+                tune.report(metrics, checkpoint=Checkpoint.from_directory(tempdir))
         else:
-            train.report(metrics)
+            tune.report(metrics)
 
 
 if __name__ == "__main__":
+    import os as _os
+    # Ray 2.35.0's get_air_verbosity() expects an int or AirVerbosity enum, but the
+    # RHOAI cluster sets AIR_VERBOSITY as a plain string. Wrap the function itself so
+    # string values are coerced even if the env var is re-injected (e.g. via ray.init).
+    try:
+        import ray.tune.experimental.output as _ray_output
+        import ray.tune.tune as _ray_tune_module
+        _orig_gav = _ray_output.get_air_verbosity
+        def _fixed_gav(verbose):
+            if isinstance(verbose, str):
+                try:
+                    verbose = int(verbose)
+                except (ValueError, TypeError):
+                    verbose = 2
+            return _orig_gav(verbose)
+        _ray_output.get_air_verbosity = _fixed_gav
+        _ray_tune_module.get_air_verbosity = _fixed_gav
+    except Exception:
+        pass
+    _os.environ.pop("AIR_VERBOSITY", None)
+
     # for early stopping
     sched = AsyncHyperBandScheduler()
     gpu_value=int("has to be specified")
@@ -198,12 +219,8 @@ def train_mnist(config):
             scheduler=sched,
             num_samples=5,
         ),
-        run_config=train.RunConfig(
+        run_config=tune.RunConfig(
             name="exp",
-            stop={
-                "mean_accuracy": 0.98,
-                "training_iteration": 5,
-            },
         ),
         param_space={
             "lr": tune.loguniform(1e-4, 1e-2),
diff --git a/tests/odh/resources/mnist_hpo_raytune.ipynb b/tests/odh/resources/mnist_hpo_raytune.ipynb
diff --git a/tests/odh/resources/requirements.txt b/tests/odh/resources/requirements.txt