@@ -121,7 +121,6 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, num
121121
122122 // Create Kueue resources
123123 resourceFlavor := CreateKueueResourceFlavor (test , v1beta1.ResourceFlavorSpec {})
124- fmt .Sprintln (gpu .ResourceLabel )
125124 defer test .Client ().Kueue ().KueueV1beta1 ().ResourceFlavors ().Delete (test .Ctx (), resourceFlavor .Name , metav1.DeleteOptions {})
126125 cqSpec := v1beta1.ClusterQueueSpec {
127126 NamespaceSelector : & metav1.LabelSelector {},
@@ -191,13 +190,13 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, num
191190
192191 for _ , trainingPod := range trainingPods {
193192 // Check that GPUs for training pods were utilized recently
194- test .Eventually (OpenShiftPrometheusGpuUtil (test , trainingPod , gpu ), 15 * time .Minute ).
193+ test .Eventually (OpenShiftPrometheusGpuUtil (test , trainingPod , gpu ), 10 * time .Minute ).
195194 Should (
196195 And (
197196 HaveLen (numGpus ),
198197 ContainElement (
199- // Check that at least some GPU was utilized on more than 50 %
200- HaveField ("Value" , BeNumerically (">" , 50 )),
198+ // Check that at least some GPU was utilized on more than 10 %
199+ HaveField ("Value" , BeNumerically (">" , 10 )),
201200 ),
202201 ),
203202 )
@@ -206,7 +205,7 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, num
206205 }
207206
208207 // Make sure the PyTorch job succeeded
209- test .Eventually (PyTorchJob (test , namespace , tuningJob .Name ), TestTimeoutGpuProvisioning ).Should (WithTransform (PyTorchJobConditionSucceeded , Equal (corev1 .ConditionTrue )))
208+ test .Eventually (PyTorchJob (test , namespace , tuningJob .Name ), TestTimeoutLong ).Should (WithTransform (PyTorchJobConditionSucceeded , Equal (corev1 .ConditionTrue )))
210209 test .T ().Logf ("PytorchJob %s/%s ran successfully" , tuningJob .Namespace , tuningJob .Name )
211210}
212211
@@ -292,7 +291,7 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
292291 `torchrun /etc/config/hf_llm_training.py \
293292 --model_uri /tmp/model/bloom-560m \
294293 --model_dir /tmp/model/bloom-560m \
295- --dataset_file /tmp/all_datasets/alpaca_data_tenth .json \
294+ --dataset_file /tmp/all_datasets/alpaca_data_hundredth .json \
296295 --transformer_type AutoModelForCausalLM \
297296 --training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/tmp/logs", "eval_strategy": "epoch", "save_strategy": "no"}' \
298297 --lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'` ,
@@ -461,7 +460,7 @@ func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap,
461460 `torchrun /etc/config/hf_llm_training.py \
462461 --model_uri /tmp/model/bloom-560m \
463462 --model_dir /tmp/model/bloom-560m \
464- --dataset_file /tmp/all_datasets/alpaca_data_tenth .json \
463+ --dataset_file /tmp/all_datasets/alpaca_data_hundredth .json \
465464 --transformer_type AutoModelForCausalLM \
466465 --training_parameters '{"output_dir": "/mnt/output", "per_device_train_batch_size": 8, "num_train_epochs": 3, "logging_dir": "/logs", "eval_strategy": "epoch", "save_strategy": "no"}' \
467466 --lora_config '{"r": 4, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"}'` ,
0 commit comments