diff --git a/tests/common/support/dscInitialization.go b/tests/common/support/dscInitialization.go index f13eec555..6818faacd 100644 --- a/tests/common/support/dscInitialization.go +++ b/tests/common/support/dscInitialization.go @@ -47,6 +47,24 @@ func GetApplicationsNamespace(test Test) (string, error) { return GetApplicationsNamespaceFromDSCI(test, DefaultDSCIName) } +func GetRHOAIVersionFromDSCI(test Test) string { + dsci, err := GetDSCI(test, DefaultDSCIName) + if err != nil { + test.T().Logf("Failed to get DSCI for version: %v", err) + return "" + } + version, found, err := unstructured.NestedString(dsci.Object, "status", "release", "version") + if err != nil { + test.T().Logf("Failed to read status.release.version from DSCI %s: %v", DefaultDSCIName, err) + return "" + } + if !found { + test.T().Logf("DSCI %s is missing status.release.version", DefaultDSCIName) + return "" + } + return version +} + func GetApplicationsNamespaceFromDSCI(test Test, dsciName string) (string, error) { dsci, err := GetDSCI(test, dsciName) if err != nil { diff --git a/tests/trainer/README.md b/tests/trainer/README.md index 2e67d7656..69354dcd5 100644 --- a/tests/trainer/README.md +++ b/tests/trainer/README.md @@ -63,6 +63,42 @@ go test ./tests/trainer/ -v go test ./tests/trainer -run TestCustomTrainingRuntimesAvailable -v ``` +## Upgrade Tests + +Upgrade tests validate that Trainer v2 resources survive an RHOAI upgrade. They run in two phases controlled by `TEST_TIER`: + +```bash +# Pre-upgrade: create resources and store baselines +TEST_TIER=Pre-Upgrade go test -v -timeout 10m ./tests/trainer/ + +# ... perform RHOAI upgrade ... 
+ +# Post-upgrade: verify resources survived and complete workloads +TEST_TIER=Post-Upgrade go test -v -timeout 10m ./tests/trainer/ +``` + +### Test Coverage + +| Test Pair | What it validates | +|-----------|-------------------| +| `TestSetupSleepTrainJob` / `TestVerifySleepTrainJob` | Running TrainJob survives upgrade with zero pod restarts | +| `TestSetupTrainingRuntime` / `TestVerifyTrainingRuntime` | Custom namespace-scoped TrainingRuntime persists, spec unchanged | +| `TestSetupCustomRuntimeUpgradeTrainJob` / `TestRunCustomRuntimeUpgradeTrainJob` | Custom ClusterTrainingRuntime + Kueue suspend/resume lifecycle | + +### Spec Integrity Checks + +Post-upgrade tests compare resource `metadata.generation` against pre-upgrade baselines stored in ConfigMaps. When generation changes (indicating a spec mutation), before/after specs are logged as JSON for analysis. The assertion is version-aware — an explicit allowlist in [`utils/utils_upgrade.go`](utils/utils_upgrade.go) defines upgrade paths where spec mutations are expected (e.g., API changes across minor versions). The RHOAI version is read from DSCI `status.release.version`. + +### Known Limitations + +- **RHOAIENG-48867**: 4 Kueue suspend/resume tests are skipped because the Trainer controller fails updating immutable JobSet `spec.replicatedJobs` when built-in ClusterTrainingRuntime specs change during upgrade. Only affects suspended jobs referencing default/versioned runtimes — running jobs and custom runtimes are not impacted. +- Tests are version-agnostic — which upgrade path is tested depends on Jenkins pipeline deployment configuration. + +### Maintenance + +- When Trainer API changes introduce spec mutations during upgrade, add the version pair to `specMutationExpectedPaths` in [`utils/utils_upgrade.go`](utils/utils_upgrade.go). +- When RHOAIENG-48867 is fixed upstream, remove the `t.Skip` calls in `trainer_kueue_upgrade_training_test.go` to enable the default and specific runtime Kueue tests. 
+ ## GPU Requirements > **Note:** The TrainingHub SDK tests (`TestOsftTrainingHubMultiNodeMultiGPU`, `TestLoraTrainingHubMultiNodeMultiGPU`, `TestSftTrainingHubMultiNodeMultiGPU`) require **NVIDIA Ampere or newer GPUs** (e.g. A100, H100). The training runtime image (`odh-training-cuda128-torch29-py312-rhel9`, referenced as `DefaultTrainingHubRuntimeCUDA` in [`tests/trainer/utils/utils_runtimes.go`](utils/utils_runtimes.go)) ships with `flash_attn==2.8.3`, which requires compute capability >= 8.0. These tests will not work on pre-Ampere GPUs such as T4 or V100. diff --git a/tests/trainer/trainer_kueue_upgrade_training_test.go b/tests/trainer/trainer_kueue_upgrade_training_test.go index 6c2dfdbd3..1fc9a8b17 100644 --- a/tests/trainer/trainer_kueue_upgrade_training_test.go +++ b/tests/trainer/trainer_kueue_upgrade_training_test.go @@ -17,6 +17,8 @@ limitations under the License. package trainer import ( + "encoding/json" + "fmt" "strings" "testing" @@ -39,11 +41,15 @@ import ( ) var ( - upgradeNamespaceName = "test-trainer-upgrade" - resourceFlavorName = "rf-trainer-upgrade" - clusterQueueName = "cq-trainer-upgrade" - localQueueName = "lq-trainer-upgrade" - upgradeTrainJobName = "trainjob-upgrade" + upgradeNamespaceName = "test-trainer-upgrade" + resourceFlavorName = "rf-trainer-upgrade" + clusterQueueName = "cq-trainer-upgrade" + localQueueName = "lq-trainer-upgrade" + upgradeTrainJobName = "trainjob-upgrade" + upgradeConfigMapName = "default-runtime-upgrade-baseline" + upgradeTrainJobGenKey = "trainjob-generation" + upgradeTrainJobSpecKey = "trainjob-spec" + rhoaiVersionKey = "rhoai-version" // Specific runtime upgrade test variables specificRuntimeNamespaceName = "test-trainer-upgrade-specific" @@ -53,6 +59,10 @@ var ( specificRuntimeTrainJobName = "trainjob-upgrade-specific" specificRuntimeConfigMapName = "specific-runtime-upgrade" specificRuntimeConfigMapKey = "runtime-name" + specificRuntimeGenerationKey = "runtime-generation" + specificRuntimeSpecKey = 
"runtime-spec" + specificTrainJobGenerationKey = "trainjob-generation" + specificTrainJobSpecKey = "trainjob-spec" // Custom runtime upgrade test variables customRuntimeNamespaceName = "test-trainer-upgrade-custom-rt" @@ -61,6 +71,11 @@ var ( customRuntimeLocalQueue = "lq-trainer-upgrade-custom-rt" customRuntimeTrainJobName = "trainjob-upgrade-custom-rt" customRuntimeCTRName = "custom-upgrade-runtime" + customRuntimeConfigMapName = "custom-runtime-upgrade-baseline" + customRuntimeGenerationKey = "ctr-generation" + customRuntimeSpecKey = "ctr-spec" + customTrainJobGenerationKey = "trainjob-generation" + customTrainJobSpecKey = "trainjob-spec" ) func TestSetupUpgradeTrainJob(t *testing.T) { @@ -130,6 +145,13 @@ func TestSetupUpgradeTrainJob(t *testing.T) { test.Eventually(TrainJob(test, trainJob.Namespace, upgradeTrainJobName), TestTimeoutShort). Should(WithTransform(TrainJobConditionSuspended, Equal(metav1.ConditionTrue))) test.T().Logf("TrainJob %s/%s is suspended, waiting for ClusterQueue to be enabled after upgrade", trainJob.Namespace, upgradeTrainJobName) + + // Store TrainJob baseline for post-upgrade integrity check + trainJob, err = test.Client().Trainer().TrainerV1alpha1().TrainJobs(upgradeNamespaceName).Get(test.Ctx(), upgradeTrainJobName, metav1.GetOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + data := map[string]string{} + addResourceBaseline(test, data, upgradeTrainJobGenKey, upgradeTrainJobSpecKey, trainJob.Generation, trainJob.Spec) + storeUpgradeBaseline(test, upgradeNamespaceName, upgradeConfigMapName, data) } func TestRunUpgradeTrainJob(t *testing.T) { @@ -144,9 +166,20 @@ func TestRunUpgradeTrainJob(t *testing.T) { defer test.Client().Kueue().KueueV1beta2().ClusterQueues().Delete(test.Ctx(), clusterQueueName, metav1.DeleteOptions{}) defer DeleteTestNamespace(test, namespace) + // Check TrainJob spec integrity + configMap, err := test.Client().Core().CoreV1().ConfigMaps(upgradeNamespaceName).Get( + test.Ctx(), upgradeConfigMapName, 
metav1.GetOptions{}) + test.Expect(err).NotTo(HaveOccurred(), "Baseline ConfigMap should exist") + + trainJob, err := test.Client().Trainer().TrainerV1alpha1().TrainJobs(upgradeNamespaceName).Get(test.Ctx(), upgradeTrainJobName, metav1.GetOptions{}) + test.Expect(err).NotTo(HaveOccurred(), "TrainJob should exist after upgrade") + + verifySpecIntegrity(test, "TrainJob", trainJob.Generation, trainJob.Spec, + configMap, upgradeTrainJobGenKey, upgradeTrainJobSpecKey) + // Enable ClusterQueue to process waiting TrainJob clusterQueue := kueueacv1beta2.ClusterQueue(clusterQueueName).WithSpec(kueueacv1beta2.ClusterQueueSpec().WithStopPolicy(kueuev1beta2.None)) - _, err := test.Client().Kueue().KueueV1beta2().ClusterQueues().Apply(test.Ctx(), clusterQueue, metav1.ApplyOptions{FieldManager: "application/apply-patch", Force: true}) + _, err = test.Client().Kueue().KueueV1beta2().ClusterQueues().Apply(test.Ctx(), clusterQueue, metav1.ApplyOptions{FieldManager: "application/apply-patch", Force: true}) test.Expect(err).NotTo(HaveOccurred()) test.T().Logf("Enabled ClusterQueue %s by setting StopPolicy to None", clusterQueueName) @@ -186,8 +219,11 @@ func TestSetupSpecificRuntimeUpgradeTrainJob(t *testing.T) { CreateOrGetTestNamespaceWithName(test, specificRuntimeNamespaceName, WithKueueManaged()) test.T().Logf("Created Kueue-managed namespace: %s", specificRuntimeNamespaceName) - // Store the runtime name in ConfigMap for post-upgrade verification - storeSpecificRuntimeInConfigMap(test, specificRuntime) + // Store the runtime baseline in ConfigMap for post-upgrade verification + ctr, err := test.Client().Trainer().TrainerV1alpha1().ClusterTrainingRuntimes().Get(test.Ctx(), specificRuntime, metav1.GetOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + baselineData := map[string]string{specificRuntimeConfigMapKey: specificRuntime} + addResourceBaseline(test, baselineData, specificRuntimeGenerationKey, specificRuntimeSpecKey, ctr.Generation, ctr.Spec) // Create Kueue resources 
with StopPolicy=Hold resourceFlavor := kueueacv1beta2.ResourceFlavor(specificRuntimeResourceFlavor) @@ -245,6 +281,12 @@ func TestSetupSpecificRuntimeUpgradeTrainJob(t *testing.T) { test.Eventually(TrainJob(test, trainJob.Namespace, specificRuntimeTrainJobName), TestTimeoutShort). Should(WithTransform(TrainJobConditionSuspended, Equal(metav1.ConditionTrue))) test.T().Logf("TrainJob %s/%s using runtime %s is suspended, waiting for upgrade", trainJob.Namespace, specificRuntimeTrainJobName, specificRuntime) + + // Store TrainJob baseline and persist ConfigMap + trainJob, err = test.Client().Trainer().TrainerV1alpha1().TrainJobs(specificRuntimeNamespaceName).Get(test.Ctx(), specificRuntimeTrainJobName, metav1.GetOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + addResourceBaseline(test, baselineData, specificTrainJobGenerationKey, specificTrainJobSpecKey, trainJob.Generation, trainJob.Spec) + storeUpgradeBaseline(test, specificRuntimeNamespaceName, specificRuntimeConfigMapName, baselineData) } // TestRunSpecificRuntimeUpgradeTrainJob verifies that a TrainJob using a specific cluster training runtime @@ -272,8 +314,13 @@ func TestRunSpecificRuntimeUpgradeTrainJob(t *testing.T) { DeleteTestNamespace(test, namespace) }() - // Verify the ClusterTrainingRuntime still exists - _, err := test.Client().Trainer().TrainerV1alpha1().ClusterTrainingRuntimes().Get(test.Ctx(), specificRuntime, metav1.GetOptions{}) + // Load baselines from ConfigMap + configMap, err := test.Client().Core().CoreV1().ConfigMaps(specificRuntimeNamespaceName).Get( + test.Ctx(), specificRuntimeConfigMapName, metav1.GetOptions{}) + test.Expect(err).NotTo(HaveOccurred(), "Baseline ConfigMap should exist") + + // Check ClusterTrainingRuntime spec integrity + ctr, err := test.Client().Trainer().TrainerV1alpha1().ClusterTrainingRuntimes().Get(test.Ctx(), specificRuntime, metav1.GetOptions{}) if err != nil { if errors.IsNotFound(err) { test.T().Logf("ClusterTrainingRuntime %s was removed during upgrade 
(expected for versioned runtimes)", specificRuntime) @@ -282,13 +329,18 @@ func TestRunSpecificRuntimeUpgradeTrainJob(t *testing.T) { } } else { test.T().Logf("ClusterTrainingRuntime %s still exists after upgrade", specificRuntime) + verifySpecIntegrity(test, "ClusterTrainingRuntime", ctr.Generation, ctr.Spec, + configMap, specificRuntimeGenerationKey, specificRuntimeSpecKey) } - // Verify TrainJob is still suspended + // Check TrainJob spec integrity trainJob, err := test.Client().Trainer().TrainerV1alpha1().TrainJobs(specificRuntimeNamespaceName).Get(test.Ctx(), specificRuntimeTrainJobName, metav1.GetOptions{}) test.Expect(err).NotTo(HaveOccurred(), "TrainJob should exist after upgrade") test.T().Logf("TrainJob %s/%s exists after upgrade with RuntimeRef: %s", trainJob.Namespace, trainJob.Name, trainJob.Spec.RuntimeRef.Name) + verifySpecIntegrity(test, "TrainJob", trainJob.Generation, trainJob.Spec, + configMap, specificTrainJobGenerationKey, specificTrainJobSpecKey) + // Enable ClusterQueue to process the TrainJob clusterQueue := kueueacv1beta2.ClusterQueue(specificRuntimeClusterQueue).WithSpec(kueueacv1beta2.ClusterQueueSpec().WithStopPolicy(kueuev1beta2.None)) _, err = test.Client().Kueue().KueueV1beta2().ClusterQueues().Apply(test.Ctx(), clusterQueue, metav1.ApplyOptions{FieldManager: "application/apply-patch", Force: true}) @@ -369,8 +421,8 @@ func TestSetupCustomRuntimeUpgradeTrainJob(t *testing.T) { test.Expect(err).NotTo(HaveOccurred()) test.T().Logf("Applied Kueue LocalQueue %s/%s successfully", appliedLocalQueue.Namespace, appliedLocalQueue.Name) - // Create TrainJob using the custom CTR - trainJob := createUpgradeTrainJob(test, customRuntimeNamespaceName, appliedLocalQueue.Name, customRuntimeTrainJobName, customRuntimeCTRName) + // Create TrainJob using the custom CTR with PodTemplateOverrides to exercise API surface coverage + trainJob := createCustomRuntimeUpgradeTrainJob(test, customRuntimeNamespaceName, appliedLocalQueue.Name) // Verify Kueue 
Workload is Inadmissible var workloadName string @@ -389,6 +441,17 @@ func TestSetupCustomRuntimeUpgradeTrainJob(t *testing.T) { test.Eventually(TrainJob(test, trainJob.Namespace, customRuntimeTrainJobName), TestTimeoutShort). Should(WithTransform(TrainJobConditionSuspended, Equal(metav1.ConditionTrue))) test.T().Logf("TrainJob %s/%s using custom runtime %s is suspended, waiting for upgrade", trainJob.Namespace, customRuntimeTrainJobName, customRuntimeCTRName) + + // Store baselines for post-upgrade integrity check + ctr, err := test.Client().Trainer().TrainerV1alpha1().ClusterTrainingRuntimes().Get(test.Ctx(), customRuntimeCTRName, metav1.GetOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + trainJob, err = test.Client().Trainer().TrainerV1alpha1().TrainJobs(customRuntimeNamespaceName).Get(test.Ctx(), customRuntimeTrainJobName, metav1.GetOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + + data := map[string]string{} + addResourceBaseline(test, data, customRuntimeGenerationKey, customRuntimeSpecKey, ctr.Generation, ctr.Spec) + addResourceBaseline(test, data, customTrainJobGenerationKey, customTrainJobSpecKey, trainJob.Generation, trainJob.Spec) + storeUpgradeBaseline(test, customRuntimeNamespaceName, customRuntimeConfigMapName, data) } func TestRunCustomRuntimeUpgradeTrainJob(t *testing.T) { @@ -402,14 +465,30 @@ func TestRunCustomRuntimeUpgradeTrainJob(t *testing.T) { _ = test.Client().Kueue().KueueV1beta2().ResourceFlavors().Delete(test.Ctx(), customRuntimeResourceFlavor, metav1.DeleteOptions{}) _ = test.Client().Kueue().KueueV1beta2().ClusterQueues().Delete(test.Ctx(), customRuntimeClusterQueue, metav1.DeleteOptions{}) _ = test.Client().Trainer().TrainerV1alpha1().ClusterTrainingRuntimes().Delete(test.Ctx(), customRuntimeCTRName, metav1.DeleteOptions{}) + _ = test.Client().Core().CoreV1().ConfigMaps(customRuntimeNamespaceName).Delete(test.Ctx(), customRuntimeConfigMapName, metav1.DeleteOptions{}) DeleteTestNamespace(test, namespace) }() - // Verify custom 
CTR still exists after upgrade - _, err := test.Client().Trainer().TrainerV1alpha1().ClusterTrainingRuntimes().Get(test.Ctx(), customRuntimeCTRName, metav1.GetOptions{}) + // Load baselines from ConfigMap + configMap, err := test.Client().Core().CoreV1().ConfigMaps(customRuntimeNamespaceName).Get( + test.Ctx(), customRuntimeConfigMapName, metav1.GetOptions{}) + test.Expect(err).NotTo(HaveOccurred(), "Baseline ConfigMap should exist") + + // Check custom CTR spec integrity + ctr, err := test.Client().Trainer().TrainerV1alpha1().ClusterTrainingRuntimes().Get(test.Ctx(), customRuntimeCTRName, metav1.GetOptions{}) test.Expect(err).NotTo(HaveOccurred(), "Custom ClusterTrainingRuntime should exist after upgrade") test.T().Logf("Custom ClusterTrainingRuntime %s is preserved after upgrade", customRuntimeCTRName) + verifySpecIntegrity(test, "Custom CTR", ctr.Generation, ctr.Spec, + configMap, customRuntimeGenerationKey, customRuntimeSpecKey) + + // Check TrainJob spec integrity + trainJob, err := test.Client().Trainer().TrainerV1alpha1().TrainJobs(customRuntimeNamespaceName).Get(test.Ctx(), customRuntimeTrainJobName, metav1.GetOptions{}) + test.Expect(err).NotTo(HaveOccurred(), "TrainJob should exist after upgrade") + + verifySpecIntegrity(test, "TrainJob", trainJob.Generation, trainJob.Spec, + configMap, customTrainJobGenerationKey, customTrainJobSpecKey) + // Enable ClusterQueue to process the TrainJob clusterQueue := kueueacv1beta2.ClusterQueue(customRuntimeClusterQueue).WithSpec(kueueacv1beta2.ClusterQueueSpec().WithStopPolicy(kueuev1beta2.None)) _, err = test.Client().Kueue().KueueV1beta2().ClusterQueues().Apply(test.Ctx(), clusterQueue, metav1.ApplyOptions{FieldManager: "application/apply-patch", Force: true}) @@ -435,6 +514,26 @@ func TestRunCustomRuntimeUpgradeTrainJob(t *testing.T) { // Helper functions +func verifySpecIntegrity(test Test, resourceName string, generation int64, spec interface{}, + configMap *corev1.ConfigMap, genKey, specKey string) { + expectedGen 
:= configMap.Data[genKey] + actualGen := fmt.Sprintf("%d", generation) + if actualGen != expectedGen { + preVersion := configMap.Data[rhoaiVersionKey] + postVersion := GetRHOAIVersionFromDSCI(test) + currentSpecJSON, _ := json.Marshal(spec) + test.T().Logf("%s generation changed during upgrade (%s to %s)", resourceName, expectedGen, actualGen) + test.T().Logf("Pre-upgrade %s spec: %s", resourceName, configMap.Data[specKey]) + test.T().Logf("Post-upgrade %s spec: %s", resourceName, currentSpecJSON) + test.Expect(preVersion).NotTo(BeEmpty(), "Pre-upgrade RHOAI version missing from baseline ConfigMap") + test.Expect(postVersion).NotTo(BeEmpty(), "Post-upgrade RHOAI version not available from DSCI") + test.Expect(trainerutils.IsSpecMutationExpected(preVersion, postVersion)).To(BeTrue(), + "Unexpected %s spec mutation for upgrade %s → %s", resourceName, preVersion, postVersion) + } else { + test.T().Logf("%s generation unchanged after upgrade: %s", resourceName, actualGen) + } +} + func createUpgradeTrainJob(test Test, namespace, localQueueName, jobName, runtimeName string) *trainerv1alpha1.TrainJob { // Delete existing TrainJob if present _, err := test.Client().Trainer().TrainerV1alpha1().TrainJobs(namespace).Get(test.Ctx(), jobName, metav1.GetOptions{}) @@ -474,6 +573,65 @@ func createUpgradeTrainJob(test Test, namespace, localQueueName, jobName, runtim return trainJob } +func createCustomRuntimeUpgradeTrainJob(test Test, namespace, localQueueName string) *trainerv1alpha1.TrainJob { + _, err := test.Client().Trainer().TrainerV1alpha1().TrainJobs(namespace).Get(test.Ctx(), customRuntimeTrainJobName, metav1.GetOptions{}) + if err == nil { + err := test.Client().Trainer().TrainerV1alpha1().TrainJobs(namespace).Delete(test.Ctx(), customRuntimeTrainJobName, metav1.DeleteOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.Eventually(TrainJobs(test, namespace), TestTimeoutShort).Should(BeEmpty()) + } else if !errors.IsNotFound(err) { + test.T().Fatalf("Error 
retrieving TrainJob with name `%s`: %v", customRuntimeTrainJobName, err) + } + + trainJob := &trainerv1alpha1.TrainJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: customRuntimeTrainJobName, + Labels: map[string]string{ + "kueue.x-k8s.io/queue-name": localQueueName, + }, + }, + Spec: trainerv1alpha1.TrainJobSpec{ + RuntimeRef: trainerv1alpha1.RuntimeRef{ + Name: customRuntimeCTRName, + }, + Trainer: &trainerv1alpha1.Trainer{ + Command: []string{ + "python", + "-c", + "import torch; print(f'PyTorch version: {torch.__version__}'); import time; time.sleep(5); print('Training completed successfully')", + }, + }, + PodTemplateOverrides: []trainerv1alpha1.PodTemplateOverride{ + { + TargetJobs: []trainerv1alpha1.PodTemplateOverrideTargetJob{ + {Name: "node"}, + }, + Metadata: &metav1.ObjectMeta{ + Labels: map[string]string{ + "upgrade-test": "custom-runtime", + }, + }, + Spec: &trainerv1alpha1.PodTemplateSpecOverride{ + Tolerations: []corev1.Toleration{ + { + Key: "upgrade-test", + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoSchedule, + }, + }, + }, + }, + }, + }, + } + + trainJob, err = test.Client().Trainer().TrainerV1alpha1().TrainJobs(namespace).Create(test.Ctx(), trainJob, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created TrainJob %s/%s with runtime %s and PodTemplateOverrides", trainJob.Namespace, trainJob.Name, customRuntimeCTRName) + + return trainJob +} + func findSpecificRuntime(test Test) string { runtimes, err := test.Client().Trainer().TrainerV1alpha1().ClusterTrainingRuntimes().List(test.Ctx(), metav1.ListOptions{}) test.Expect(err).NotTo(HaveOccurred(), "Failed to list ClusterTrainingRuntimes") @@ -495,24 +653,26 @@ func findSpecificRuntime(test Test) string { return specificRuntimes[0] } -// storeSpecificRuntimeInConfigMap stores the specific runtime name for post-upgrade verification -func storeSpecificRuntimeInConfigMap(test Test, runtimeName string) { +func addResourceBaseline(test Test, data 
map[string]string, genKey, specKey string, generation int64, spec interface{}) { + specJSON, err := json.Marshal(spec) + test.Expect(err).NotTo(HaveOccurred()) + data[genKey] = fmt.Sprintf("%d", generation) + data[specKey] = string(specJSON) +} + +func storeUpgradeBaseline(test Test, namespace, configMapName string, data map[string]string) { + data[rhoaiVersionKey] = GetRHOAIVersionFromDSCI(test) configMap := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ - Name: specificRuntimeConfigMapName, - Namespace: specificRuntimeNamespaceName, - }, - Data: map[string]string{ - specificRuntimeConfigMapKey: runtimeName, + Name: configMapName, + Namespace: namespace, }, + Data: data, } - - // Delete existing ConfigMap if present - _ = test.Client().Core().CoreV1().ConfigMaps(specificRuntimeNamespaceName).Delete(test.Ctx(), specificRuntimeConfigMapName, metav1.DeleteOptions{}) - - _, err := test.Client().Core().CoreV1().ConfigMaps(specificRuntimeNamespaceName).Create(test.Ctx(), configMap, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred(), "Failed to create ConfigMap for specific runtime") - test.T().Logf("Stored specific runtime name in ConfigMap %s/%s: %s", specificRuntimeNamespaceName, specificRuntimeConfigMapName, runtimeName) + _ = test.Client().Core().CoreV1().ConfigMaps(namespace).Delete(test.Ctx(), configMapName, metav1.DeleteOptions{}) + _, err := test.Client().Core().CoreV1().ConfigMaps(namespace).Create(test.Ctx(), configMap, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Stored upgrade baseline in ConfigMap %s/%s", namespace, configMapName) } // getSpecificRuntimeFromConfigMap retrieves the specific runtime name from ConfigMap diff --git a/tests/trainer/trainer_trainingruntime_upgrade_test.go b/tests/trainer/trainer_trainingruntime_upgrade_test.go index 952677466..a7f5086b5 100644 --- a/tests/trainer/trainer_trainingruntime_upgrade_test.go +++ b/tests/trainer/trainer_trainingruntime_upgrade_test.go @@ -35,6 +35,9 
@@ import ( var ( runtimeNamespaceName = "test-trainer-upgrade-runtime" customRuntimeName = "custom-sleep-runtime" + runtimeConfigMapName = "runtime-upgrade-baseline" + runtimeGenerationKey = "runtime-generation" + runtimeSpecKey = "runtime-spec" ) func TestSetupTrainingRuntime(t *testing.T) { @@ -53,6 +56,11 @@ func TestSetupTrainingRuntime(t *testing.T) { test.Expect(err).NotTo(HaveOccurred()) test.Expect(runtime.Name).To(Equal(customRuntimeName)) test.T().Logf("Custom TrainingRuntime %s/%s created successfully", runtimeNamespaceName, customRuntimeName) + + // Store baseline for post-upgrade verification + data := map[string]string{} + addResourceBaseline(test, data, runtimeGenerationKey, runtimeSpecKey, runtime.Generation, runtime.Spec) + storeUpgradeBaseline(test, runtimeNamespaceName, runtimeConfigMapName, data) } func TestVerifyTrainingRuntime(t *testing.T) { @@ -75,6 +83,18 @@ func TestVerifyTrainingRuntime(t *testing.T) { test.Expect(runtimeNames).To(ContainElement(customRuntimeName), "Custom TrainingRuntime should exist after upgrade. 
Found runtimes: %v", runtimeNames) test.T().Logf("TrainingRuntime %s/%s is preserved after upgrade", runtimeNamespaceName, customRuntimeName) + + // Check spec integrity across upgrade + runtime, err := test.Client().Trainer().TrainerV1alpha1().TrainingRuntimes(runtimeNamespaceName).Get( + test.Ctx(), customRuntimeName, metav1.GetOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + + configMap, err := test.Client().Core().CoreV1().ConfigMaps(runtimeNamespaceName).Get( + test.Ctx(), runtimeConfigMapName, metav1.GetOptions{}) + test.Expect(err).NotTo(HaveOccurred(), "Baseline ConfigMap should exist") + + verifySpecIntegrity(test, "TrainingRuntime", runtime.Generation, runtime.Spec, + configMap, runtimeGenerationKey, runtimeSpecKey) } func createCustomTrainingRuntime(test Test, namespace string) *trainerv1alpha1.TrainingRuntime { diff --git a/tests/trainer/utils/utils_upgrade.go b/tests/trainer/utils/utils_upgrade.go new file mode 100644 index 000000000..e8c5b6e1e --- /dev/null +++ b/tests/trainer/utils/utils_upgrade.go @@ -0,0 +1,50 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package trainer + +import "strings" + +// specMutationExpectedPaths lists from→to minor version pairs where Trainer API changes +// are known to mutate existing TrainJob/TrainingRuntime specs during upgrade. 
var specMutationExpectedPaths = [][2]string{
	// Entries must be written in normalized "major.minor" form (no "v" prefix,
	// no patch component), because lookups compare against normalized versions.
	//
	// kubeflow/trainer#3309: PodTemplateOverrides → RuntimePatches
	// {"3.4", "3.5"},
}

// IsSpecMutationExpected reports whether the upgrade path fromVersion→toVersion
// is listed in specMutationExpectedPaths, i.e. is known to mutate specs.
//
// Both versions are reduced to "major.minor" before the lookup, so patch
// components, pre-release/build suffixes, a leading "v" and surrounding
// whitespace are ignored. It returns false when either version cannot be
// reduced to a numeric "major.minor" pair (for example an empty string).
func IsSpecMutationExpected(fromVersion, toVersion string) bool {
	fromMinor := majorMinor(fromVersion)
	toMinor := majorMinor(toVersion)
	if fromMinor == "" || toMinor == "" {
		return false
	}
	for _, pair := range specMutationExpectedPaths {
		if pair[0] == fromMinor && pair[1] == toMinor {
			return true
		}
	}
	return false
}

// majorMinor normalizes a version string to "major.minor".
//
// Accepted shapes include "3.4", "3.4.1", "v3.4.1", "3.4-rc1" and
// "3.4.0-ea.1"; all of these yield "3.4". An empty string is returned when the
// input has no minor component or when the major/minor components are not
// numeric, so callers can treat "" as "version unknown".
func majorMinor(version string) string {
	version = strings.TrimSpace(version)
	// Tolerate the common "v" tag prefix (e.g. "v3.4.0").
	if len(version) > 0 && (version[0] == 'v' || version[0] == 'V') {
		version = version[1:]
	}

	parts := strings.SplitN(version, ".", 3)
	if len(parts) < 2 {
		return ""
	}

	major := parts[0]
	// A two-component version may carry its suffix directly on the minor
	// ("3.4-rc1"), so only the leading digits are significant.
	minor := leadingDigits(parts[1])
	if major == "" || leadingDigits(major) != major || minor == "" {
		return ""
	}
	return major + "." + minor
}

// leadingDigits returns the longest prefix of s made up of ASCII digits.
func leadingDigits(s string) string {
	end := strings.IndexFunc(s, func(r rune) bool { return r < '0' || r > '9' })
	if end < 0 {
		return s
	}
	return s[:end]
}