@@ -20,11 +20,15 @@ package tests
2020
2121import (
2222 "fmt"
23+ "io"
24+ "os"
25+ "path/filepath"
2326 "sort"
2427 "strings"
2528 "time"
2629
2730 "github.com/ai-dynamo/grove/operator/e2e/setup"
31+ "github.com/ai-dynamo/grove/operator/e2e/utils"
2832 corev1 "k8s.io/api/core/v1"
2933 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3034 "k8s.io/apimachinery/pkg/runtime/schema"
@@ -37,6 +41,11 @@ const (
3741
3842 // eventLookbackDuration is how far back to look for events
3943 eventLookbackDuration = 10 * time .Minute
44+
45+ // DiagnosticsToStdoutEnvVar is the environment variable that controls diagnostics output.
46+ // If set to any non-empty value, diagnostics are printed to stdout.
47+ // If not set or empty, diagnostics are written to a timestamped file.
48+ DiagnosticsToStdoutEnvVar = "GROVE_E2E_DIAG_TO_STDOUT"
4049)
4150
4251// isPodReady checks if a pod is ready
@@ -64,28 +73,92 @@ var groveResourceTypes = []groveResourceType{
6473 {"PodGangs" , schema.GroupVersionResource {Group : "scheduler.grove.io" , Version : "v1alpha1" , Resource : "podgangs" }, "PODGANG" },
6574}
6675
76+ // createDiagnosticsWriter creates an io.Writer for diagnostics output.
77+ // If GROVE_E2E_DIAG_TO_STDOUT is set, returns os.Stdout.
78+ // Otherwise, creates a timestamped file using the test name in the current directory.
79+ // The caller is responsible for closing the returned io.Closer (may be nil for stdout).
80+ func createDiagnosticsWriter (testName string ) (io.Writer , io.Closer , string , error ) {
81+ if os .Getenv (DiagnosticsToStdoutEnvVar ) != "" {
82+ return os .Stdout , nil , "" , nil
83+ }
84+
85+ // Sanitize test name for use in filename (replace / with _)
86+ sanitizedName := strings .ReplaceAll (testName , "/" , "_" )
87+
88+ // Create a timestamped file with test name
89+ timestamp := time .Now ().Format ("2006-01-02_15-04-05" )
90+ filename := fmt .Sprintf ("%s_%s.log" , sanitizedName , timestamp )
91+
92+ // Try to create the file in the current directory
93+ file , err := os .Create (filename )
94+ if err != nil {
95+ // Fall back to a temp directory if we can't write to current dir
96+ filename = filepath .Join (os .TempDir (), filename )
97+ file , err = os .Create (filename )
98+ if err != nil {
99+ return nil , nil , "" , fmt .Errorf ("failed to create diagnostics file: %w" , err )
100+ }
101+ }
102+
103+ return file , file , filename , nil
104+ }
105+
67106// CollectAllDiagnostics collects and prints all diagnostic information at INFO level.
68107// This should be called when a test fails, before cleanup runs.
69108// All output is at INFO level to ensure visibility regardless of log level settings.
109+ //
110+ // Diagnostics are written to a timestamped file using the test name (e.g., TestRollingUpdate_2025-01-22_15-04-05.log).
111+ // Set the DiagnosticsToStdoutEnvVar environment variable to output to stdout instead.
70112func CollectAllDiagnostics (tc TestContext ) {
113+ // Get test name for the diagnostics file
114+ testName := "unknown_test"
115+ if tc .T != nil {
116+ testName = tc .T .Name ()
117+ }
118+
119+ // Create diagnostics output writer
120+ writer , closer , filename , err := createDiagnosticsWriter (testName )
121+ if err != nil {
122+ logger .Errorf ("Failed to create diagnostics writer, falling back to stdout: %v" , err )
123+ writer = os .Stdout
124+ filename = ""
125+ }
126+ if closer != nil {
127+ defer closer .Close ()
128+ }
129+
130+ // Save reference to stdout logger, then shadow with diagnostics logger
131+ stdoutLogger := logger
132+ logger := utils .NewTestLoggerWithOutput (utils .InfoLevel , writer )
133+
134+ // Log where diagnostics are being written (to main test output)
135+ if filename != "" {
136+ stdoutLogger .Infof ("Writing diagnostics to file: %s" , filename )
137+ }
138+
71139 logger .Info ("================================================================================" )
72140 logger .Info ("=== COLLECTING FAILURE DIAGNOSTICS ===" )
73141 logger .Info ("================================================================================" )
74142
75143 // Collect each type of diagnostic, continuing even if one fails
76- dumpOperatorLogs (tc )
77- dumpGroveResources (tc )
78- dumpPodDetails (tc )
79- dumpRecentEvents (tc )
144+ dumpOperatorLogs (tc , logger )
145+ dumpGroveResources (tc , logger )
146+ dumpPodDetails (tc , logger )
147+ dumpRecentEvents (tc , logger )
80148
81149 logger .Info ("================================================================================" )
82150 logger .Info ("=== END OF FAILURE DIAGNOSTICS ===" )
83151 logger .Info ("================================================================================" )
152+
153+ // Log completion message (to main test output)
154+ if filename != "" {
155+ stdoutLogger .Infof ("Diagnostics collection complete. Output written to: %s" , filename )
156+ }
84157}
85158
86159// dumpOperatorLogs captures and prints operator logs at INFO level.
87160// Captures all logs from all containers in the operator pod.
88- func dumpOperatorLogs (tc TestContext ) {
161+ func dumpOperatorLogs (tc TestContext , logger * utils. Logger ) {
89162 logger .Info ("================================================================================" )
90163 logger .Info ("=== OPERATOR LOGS (all) ===" )
91164 logger .Info ("================================================================================" )
@@ -175,7 +248,7 @@ func dumpOperatorLogs(tc TestContext) {
175248}
176249
177250// dumpGroveResources dumps all Grove resources as YAML at INFO level.
178- func dumpGroveResources (tc TestContext ) {
251+ func dumpGroveResources (tc TestContext , logger * utils. Logger ) {
179252 logger .Info ("================================================================================" )
180253 logger .Info ("=== GROVE RESOURCES ===" )
181254 logger .Info ("================================================================================" )
@@ -221,7 +294,7 @@ func dumpGroveResources(tc TestContext) {
221294// dumpPodDetails dumps detailed pod information at INFO level.
222295// Lists ALL pods in the namespace (not filtered by workload label selector)
223296// to ensure we capture all relevant pods during failure diagnostics.
224- func dumpPodDetails (tc TestContext ) {
297+ func dumpPodDetails (tc TestContext , logger * utils. Logger ) {
225298 logger .Info ("================================================================================" )
226299 logger .Info ("=== POD DETAILS ===" )
227300 logger .Info ("================================================================================" )
@@ -305,7 +378,7 @@ func dumpPodDetails(tc TestContext) {
305378}
306379
307380// dumpRecentEvents dumps Kubernetes events from the last eventLookbackDuration at INFO level.
308- func dumpRecentEvents (tc TestContext ) {
381+ func dumpRecentEvents (tc TestContext , logger * utils. Logger ) {
309382 logger .Info ("================================================================================" )
310383 logger .Infof ("=== KUBERNETES EVENTS (last %v) ===" , eventLookbackDuration )
311384 logger .Info ("================================================================================" )
0 commit comments