Skip to content

Commit 37f5db4

Browse files
committed
make printing diagnostics to stdout opt in, write to file by default
1 parent dae0b7d commit 37f5db4

File tree

2 files changed

+83
-8
lines changed

2 files changed

+83
-8
lines changed

.github/workflows/e2e-test.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ jobs:
8181
skaffold version
8282
8383
- name: Run e2e tests - ${{ matrix.test_name }}
84+
env:
85+
GROVE_E2E_DIAG_TO_STDOUT: "1"
8486
run: |
8587
cd operator
8688
echo "> Preparing charts (copying CRDs)..."

operator/e2e/tests/debug_utils.go

Lines changed: 81 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,15 @@ package tests
2020

2121
import (
2222
"fmt"
23+
"io"
24+
"os"
25+
"path/filepath"
2326
"sort"
2427
"strings"
2528
"time"
2629

2730
"github.com/ai-dynamo/grove/operator/e2e/setup"
31+
"github.com/ai-dynamo/grove/operator/e2e/utils"
2832
corev1 "k8s.io/api/core/v1"
2933
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3034
"k8s.io/apimachinery/pkg/runtime/schema"
@@ -37,6 +41,11 @@ const (
3741

3842
// eventLookbackDuration is how far back to look for events
3943
eventLookbackDuration = 10 * time.Minute
44+
45+
// DiagnosticsToStdoutEnvVar is the environment variable that controls diagnostics output.
46+
// If set to any non-empty value, diagnostics are printed to stdout.
47+
// If not set or empty, diagnostics are written to a timestamped file.
48+
DiagnosticsToStdoutEnvVar = "GROVE_E2E_DIAG_TO_STDOUT"
4049
)
4150

4251
// isPodReady checks if a pod is ready
@@ -64,28 +73,92 @@ var groveResourceTypes = []groveResourceType{
6473
{"PodGangs", schema.GroupVersionResource{Group: "scheduler.grove.io", Version: "v1alpha1", Resource: "podgangs"}, "PODGANG"},
6574
}
6675

76+
// createDiagnosticsWriter creates an io.Writer for diagnostics output.
77+
// If GROVE_E2E_DIAG_TO_STDOUT is set, returns os.Stdout.
78+
// Otherwise, creates a timestamped file using the test name in the current directory.
79+
// The caller is responsible for closing the returned io.Closer (may be nil for stdout).
80+
func createDiagnosticsWriter(testName string) (io.Writer, io.Closer, string, error) {
81+
if os.Getenv(DiagnosticsToStdoutEnvVar) != "" {
82+
return os.Stdout, nil, "", nil
83+
}
84+
85+
// Sanitize test name for use in filename (replace / with _)
86+
sanitizedName := strings.ReplaceAll(testName, "/", "_")
87+
88+
// Create a timestamped file with test name
89+
timestamp := time.Now().Format("2006-01-02_15-04-05")
90+
filename := fmt.Sprintf("%s_%s.log", sanitizedName, timestamp)
91+
92+
// Try to create the file in the current directory
93+
file, err := os.Create(filename)
94+
if err != nil {
95+
// Fall back to a temp directory if we can't write to current dir
96+
filename = filepath.Join(os.TempDir(), filename)
97+
file, err = os.Create(filename)
98+
if err != nil {
99+
return nil, nil, "", fmt.Errorf("failed to create diagnostics file: %w", err)
100+
}
101+
}
102+
103+
return file, file, filename, nil
104+
}
105+
67106
// CollectAllDiagnostics collects and prints all diagnostic information at INFO level.
68107
// This should be called when a test fails, before cleanup runs.
69108
// All output is at INFO level to ensure visibility regardless of log level settings.
109+
//
110+
// Diagnostics are written to a timestamped file using the test name (e.g., TestRollingUpdate_2025-01-22_15-04-05.log).
111+
// Set the DiagnosticsToStdoutEnvVar environment variable to output to stdout instead.
70112
func CollectAllDiagnostics(tc TestContext) {
113+
// Get test name for the diagnostics file
114+
testName := "unknown_test"
115+
if tc.T != nil {
116+
testName = tc.T.Name()
117+
}
118+
119+
// Create diagnostics output writer
120+
writer, closer, filename, err := createDiagnosticsWriter(testName)
121+
if err != nil {
122+
logger.Errorf("Failed to create diagnostics writer, falling back to stdout: %v", err)
123+
writer = os.Stdout
124+
filename = ""
125+
}
126+
if closer != nil {
127+
defer closer.Close()
128+
}
129+
130+
// Save reference to stdout logger, then shadow with diagnostics logger
131+
stdoutLogger := logger
132+
logger := utils.NewTestLoggerWithOutput(utils.InfoLevel, writer)
133+
134+
// Log where diagnostics are being written (to main test output)
135+
if filename != "" {
136+
stdoutLogger.Infof("Writing diagnostics to file: %s", filename)
137+
}
138+
71139
logger.Info("================================================================================")
72140
logger.Info("=== COLLECTING FAILURE DIAGNOSTICS ===")
73141
logger.Info("================================================================================")
74142

75143
// Collect each type of diagnostic, continuing even if one fails
76-
dumpOperatorLogs(tc)
77-
dumpGroveResources(tc)
78-
dumpPodDetails(tc)
79-
dumpRecentEvents(tc)
144+
dumpOperatorLogs(tc, logger)
145+
dumpGroveResources(tc, logger)
146+
dumpPodDetails(tc, logger)
147+
dumpRecentEvents(tc, logger)
80148

81149
logger.Info("================================================================================")
82150
logger.Info("=== END OF FAILURE DIAGNOSTICS ===")
83151
logger.Info("================================================================================")
152+
153+
// Log completion message (to main test output)
154+
if filename != "" {
155+
stdoutLogger.Infof("Diagnostics collection complete. Output written to: %s", filename)
156+
}
84157
}
85158

86159
// dumpOperatorLogs captures and prints operator logs at INFO level.
87160
// Captures all logs from all containers in the operator pod.
88-
func dumpOperatorLogs(tc TestContext) {
161+
func dumpOperatorLogs(tc TestContext, logger *utils.Logger) {
89162
logger.Info("================================================================================")
90163
logger.Info("=== OPERATOR LOGS (all) ===")
91164
logger.Info("================================================================================")
@@ -175,7 +248,7 @@ func dumpOperatorLogs(tc TestContext) {
175248
}
176249

177250
// dumpGroveResources dumps all Grove resources as YAML at INFO level.
178-
func dumpGroveResources(tc TestContext) {
251+
func dumpGroveResources(tc TestContext, logger *utils.Logger) {
179252
logger.Info("================================================================================")
180253
logger.Info("=== GROVE RESOURCES ===")
181254
logger.Info("================================================================================")
@@ -221,7 +294,7 @@ func dumpGroveResources(tc TestContext) {
221294
// dumpPodDetails dumps detailed pod information at INFO level.
222295
// Lists ALL pods in the namespace (not filtered by workload label selector)
223296
// to ensure we capture all relevant pods during failure diagnostics.
224-
func dumpPodDetails(tc TestContext) {
297+
func dumpPodDetails(tc TestContext, logger *utils.Logger) {
225298
logger.Info("================================================================================")
226299
logger.Info("=== POD DETAILS ===")
227300
logger.Info("================================================================================")
@@ -305,7 +378,7 @@ func dumpPodDetails(tc TestContext) {
305378
}
306379

307380
// dumpRecentEvents dumps Kubernetes events from the last eventLookbackDuration at INFO level.
308-
func dumpRecentEvents(tc TestContext) {
381+
func dumpRecentEvents(tc TestContext, logger *utils.Logger) {
309382
logger.Info("================================================================================")
310383
logger.Infof("=== KUBERNETES EVENTS (last %v) ===", eventLookbackDuration)
311384
logger.Info("================================================================================")

0 commit comments

Comments
 (0)