Skip to content

Commit 3d7cf81

Browse files
pawelpaszki authored and sutaakar committed
RHOAIENG-60624: fix nvidia GPU tests
1 parent c3c0cb3 commit 3d7cf81

7 files changed

Lines changed: 292 additions & 271 deletions

tests/odh/mnist_ray_test.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -65,7 +65,7 @@ func mnistRay(t *testing.T, numGpus int, gpuResourceName string, rayImage string
6565
test := With(t)
6666

6767
// Create a namespace
68-
namespace := test.NewTestNamespace()
68+
namespace := test.NewTestNamespace(WithKueueManaged())
6969

7070
// Ensure Notebook ServiceAccount exists (no extra RBAC)
7171
ensureNotebookServiceAccount(test, namespace.Name)

tests/odh/mnist_raytune_hpo_test.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
4545
test := With(t)
4646

4747
// Creating a namespace
48-
namespace := test.NewTestNamespace()
48+
namespace := test.NewTestNamespace(WithKueueManaged())
4949

5050
// Ensure Notebook ServiceAccount exists (no extra RBAC)
5151
ensureNotebookServiceAccount(test, namespace.Name)
@@ -72,7 +72,7 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
7272
},
7373
{
7474
Name: corev1.ResourceName("nvidia.com/gpu"),
75-
NominalQuota: resource.MustParse(fmt.Sprint(numGpus)),
75+
NominalQuota: resource.MustParse(fmt.Sprint(numGpus * 2)), // 2 workers x numGpus each
7676
},
7777
},
7878
},

tests/odh/ray_finetune_llm_deepspeed_test.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@ func rayFinetuneLlmDeepspeed(t *testing.T, numGpus int, modelName string, modelC
4444
test := With(t)
4545

4646
// Create a namespace
47-
namespace := test.NewTestNamespace()
47+
namespace := test.NewTestNamespace(WithKueueManaged())
4848
var workingDirectory, err = os.Getwd()
4949
test.Expect(err).ToNot(HaveOccurred())
5050

tests/odh/raytune_oai_mr_grpc_test.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ func raytuneHpo(t *testing.T, numGpus int) {
4646
test := With(t)
4747

4848
// Create a namespace
49-
namespace := test.NewTestNamespace()
49+
namespace := test.NewTestNamespace(WithKueueManaged())
5050

5151
// Ensure Notebook ServiceAccount exists (no extra RBAC)
5252
ensureNotebookServiceAccount(test, namespace.Name)

tests/odh/resources/mnist_hpo.py

Lines changed: 25 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -171,7 +171,7 @@ def train_mnist(config):
171171
model.parameters(), lr=config["lr"], momentum=config["momentum"]
172172
)
173173

174-
while True:
174+
for _ in range(5):
175175
train_func(model, optimizer, train_loader, device)
176176
acc = test_func(model, test_loader, device)
177177
metrics = {"mean_accuracy": acc}
@@ -180,12 +180,33 @@ def train_mnist(config):
180180
if should_checkpoint:
181181
with tempfile.TemporaryDirectory() as tempdir:
182182
torch.save(model.state_dict(), os.path.join(tempdir, "model.pt"))
183-
train.report(metrics, checkpoint=Checkpoint.from_directory(tempdir))
183+
tune.report(metrics, checkpoint=Checkpoint.from_directory(tempdir))
184184
else:
185-
train.report(metrics)
185+
tune.report(metrics)
186186

187187

188188
if __name__ == "__main__":
189+
import os as _os
190+
# Ray 2.35.0's get_air_verbosity() expects int or AirVerbosity enum, but the
191+
# RHOAI cluster sets AIR_VERBOSITY as a plain string. Patch at the source so
192+
# it works regardless of when/how the env-var is re-injected (e.g. via ray.init).
193+
try:
194+
import ray.tune.experimental.output as _ray_output
195+
import ray.tune.tune as _ray_tune_module
196+
_orig_gav = _ray_output.get_air_verbosity
197+
def _fixed_gav(verbose):
198+
if isinstance(verbose, str):
199+
try:
200+
verbose = int(verbose)
201+
except (ValueError, TypeError):
202+
verbose = 2
203+
return _orig_gav(verbose)
204+
_ray_output.get_air_verbosity = _fixed_gav
205+
_ray_tune_module.get_air_verbosity = _fixed_gav
206+
except Exception:
207+
pass
208+
_os.environ.pop("AIR_VERBOSITY", None)
209+
189210
# for early stopping
190211
sched = AsyncHyperBandScheduler()
191212
gpu_value=int("has to be specified")
@@ -198,12 +219,8 @@ def train_mnist(config):
198219
scheduler=sched,
199220
num_samples=5,
200221
),
201-
run_config=train.RunConfig(
222+
run_config=tune.RunConfig(
202223
name="exp",
203-
stop={
204-
"mean_accuracy": 0.98,
205-
"training_iteration": 5,
206-
},
207224
),
208225
param_space={
209226
"lr": tune.loguniform(1e-4, 1e-2),

0 commit comments

Comments
 (0)