diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml index 41c6e4ac7..66c48462d 100644 --- a/.github/workflows/e2e-test.yaml +++ b/.github/workflows/e2e-test.yaml @@ -81,12 +81,13 @@ jobs: skaffold version - name: Run e2e tests - ${{ matrix.test_name }} + env: + # Override diagnostics for CI: write to /tmp for artifact upload + GROVE_E2E_DIAG_MODE: "file" + GROVE_E2E_DIAG_DIR: "/tmp" run: | cd operator - echo "> Preparing charts (copying CRDs)..." - ./hack/prepare-charts.sh - echo "> Running e2e tests for ${{ matrix.test_name }}..." - cd e2e && go test -tags=e2e ./tests/... -v -timeout 45m -run '${{ matrix.test_pattern }}' + make test-e2e TEST_PATTERN='${{ matrix.test_pattern }}' # The test code handles cleanup via Teardown(), but this step provides # extra safety in case of timeout or panic. Also good practice to ensure diff --git a/operator/Makefile b/operator/Makefile index e1ca6d8ab..fdeb5df0f 100644 --- a/operator/Makefile +++ b/operator/Makefile @@ -89,11 +89,18 @@ cover-html: test-cover # Run e2e tests # Usage: make test-e2e [TEST_PATTERN=] # Examples: -# make test-e2e # Run all tests -# make test-e2e TEST_PATTERN=Test_GS # Run all gang scheduling tests -# make test-e2e TEST_PATTERN=Test_GS1 # Run specific test -# make test-e2e TEST_PATTERN=Test_TAS # Run all topology tests +# make test-e2e # Run all tests +# make test-e2e TEST_PATTERN=Test_GS # Run gang scheduling tests +# make test-e2e TEST_PATTERN=Test_GS1 # Run specific test +# make test-e2e TEST_PATTERN=Test_TAS # Run topology tests +# GROVE_E2E_DIAG_MODE=both make test-e2e # Output to stdout and file +# GROVE_E2E_DIAG_MODE=stdout make test-e2e # Output to stdout only +# +# Environment variables: +# GROVE_E2E_DIAG_MODE - Diagnostic output mode: stdout, file, both (default: file) +# GROVE_E2E_DIAG_DIR - Directory for diagnostic files (default: operator/) .PHONY: test-e2e +test-e2e: export GROVE_E2E_DIAG_DIR = $(MODULE_ROOT) test-e2e: @echo "> Preparing charts (copying CRDs)..." @$(MODULE_HACK_DIR)/prepare-charts.sh diff --git a/operator/e2e/tests/debug_utils.go b/operator/e2e/tests/debug_utils.go index e249143f5..261755ef6 100644 --- a/operator/e2e/tests/debug_utils.go +++ b/operator/e2e/tests/debug_utils.go @@ -19,12 +19,17 @@ package tests import ( + "errors" "fmt" + "io" + "os" + "path/filepath" "sort" "strings" "time" "github.com/ai-dynamo/grove/operator/e2e/setup" + "github.com/ai-dynamo/grove/operator/e2e/utils" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" @@ -35,12 +40,21 @@ const ( // logBufferSize is the size of the buffer for reading logs from the operator (debugging purposes) logBufferSize = 64 * 1024 // 64KB - // operatorLogLines is the number of log lines to capture from the operator. - // Set to 2000 to ensure we capture logs from before the failure occurred, - // not just the steady-state logs after the failure. - operatorLogLines = 2000 // eventLookbackDuration is how far back to look for events eventLookbackDuration = 10 * time.Minute + + // DiagnosticsModeEnvVar controls diagnostic output mode. + // Values: "stdout", "file", "both" (default: "file") + DiagnosticsModeEnvVar = "GROVE_E2E_DIAG_MODE" + + // DiagnosticsDirEnvVar specifies the directory for diagnostic files. + // Used when mode is "file" or "both". Default: current directory (fallback to /tmp). + DiagnosticsDirEnvVar = "GROVE_E2E_DIAG_DIR" + + // Diagnostics mode constants + DiagnosticsModeStdout = "stdout" + DiagnosticsModeFile = "file" + DiagnosticsModeBoth = "both" ) // isPodReady checks if a pod is ready @@ -68,30 +82,183 @@ var groveResourceTypes = []groveResourceType{ {"PodGangs", schema.GroupVersionResource{Group: "scheduler.grove.io", Version: "v1alpha1", Resource: "podgangs"}, "PODGANG"}, } +// nopCloser wraps an io.Writer to implement io.WriteCloser with a no-op Close. +// Used for os.Stdout which we don't want to actually close. +type nopCloser struct { + io.Writer +} + +func (nopCloser) Close() error { return nil } + +// multiWriteCloser wraps multiple io.WriteCloser instances. +// Write calls write to all; Close closes all. +type multiWriteCloser struct { + writers []io.WriteCloser +} + +func newMultiWriteCloser(writers ...io.WriteCloser) *multiWriteCloser { + return &multiWriteCloser{writers: writers} +} + +func (m *multiWriteCloser) Write(p []byte) (n int, err error) { + for _, w := range m.writers { + n, err = w.Write(p) + if err != nil { + return n, err + } + } + return len(p), nil +} + +func (m *multiWriteCloser) Close() error { + var errs []error + for _, w := range m.writers { + if err := w.Close(); err != nil { + errs = append(errs, err) + } + } + return errors.Join(errs...) +} + +// createDiagnosticsOutput creates writer(s) for diagnostics output based on GROVE_E2E_DIAG_MODE. +// Mode values: +// - "stdout": write to stdout only +// - "file": write to timestamped file only (default) +// - "both": write to both stdout and file +// +// When writing to file, GROVE_E2E_DIAG_DIR specifies the directory (default: current dir, fallback to /tmp). +// Returns the writer, filename (empty if stdout-only), and any error. +func createDiagnosticsOutput(testName string) (io.WriteCloser, string, error) { + mode := os.Getenv(DiagnosticsModeEnvVar) + if mode == "" { + mode = DiagnosticsModeFile // default + } + + var writers []io.WriteCloser + var filename string + + // Add stdout if mode is stdout or both + if mode == DiagnosticsModeStdout || mode == DiagnosticsModeBoth { + writers = append(writers, nopCloser{os.Stdout}) + } + + // Add file if mode is file or both + if mode == DiagnosticsModeFile || mode == DiagnosticsModeBoth { + file, name, err := createDiagnosticsFile(testName) + if err != nil { + // If we can't create a file but stdout is available, continue with stdout only + if mode == DiagnosticsModeBoth { + logger.Infof("[DIAG] Warning: failed to create diagnostics file: %v (continuing with stdout only)", err) + } else { + return nil, "", err + } + } else { + filename = name + writers = append(writers, file) + } + } + + if len(writers) == 0 { + return nil, "", fmt.Errorf("no valid diagnostics output configured (mode=%s)", mode) + } + + return newMultiWriteCloser(writers...), filename, nil +} + +// createDiagnosticsFile creates a timestamped diagnostics file +func createDiagnosticsFile(testName string) (*os.File, string, error) { + // Sanitize test name for use in filename (replace / with _) + sanitizedName := strings.ReplaceAll(testName, "/", "_") + + // Create a timestamped file with test name + timestamp := time.Now().Format("2006-01-02_15-04-05") + baseFilename := fmt.Sprintf("e2e-diag-%s_%s.log", sanitizedName, timestamp) + + // Determine the directory for the diagnostics file + diagDir := os.Getenv(DiagnosticsDirEnvVar) + if diagDir != "" { + filename := filepath.Join(diagDir, baseFilename) + file, err := os.Create(filename) + if err != nil { + return nil, "", fmt.Errorf("failed to create diagnostics file in %s: %w", diagDir, err) + } + return file, filename, nil + } + + // Try to create the file in the current directory + file, err := os.Create(baseFilename) + if err != nil { + // Fall back to a temp directory if we can't write to current dir + filename := filepath.Join(os.TempDir(), baseFilename) + file, err = os.Create(filename) + if err != nil { + return nil, "", fmt.Errorf("failed to create diagnostics file: %w", err) + } + return file, filename, nil + } + + return file, baseFilename, nil +} + // CollectAllDiagnostics collects and prints all diagnostic information at INFO level. // This should be called when a test fails, before cleanup runs. // All output is at INFO level to ensure visibility regardless of log level settings. +// +// Output is controlled by GROVE_E2E_DIAG_MODE env var: +// - "stdout": print to stdout only +// - "file": write to timestamped file only (default) +// - "both": write to both stdout and file +// +// Set GROVE_E2E_DIAG_DIR env var to specify the output directory for diagnostics files. func CollectAllDiagnostics(tc TestContext) { + // Get test name for the diagnostics file + testName := "unknown_test" + if tc.T != nil { + testName = tc.T.Name() + } + + // Create diagnostics output + output, filename, err := createDiagnosticsOutput(testName) + if err != nil { + logger.Errorf("Failed to create diagnostics output, falling back to stdout: %v", err) + output = nopCloser{os.Stdout} + } + defer output.Close() + + // Save reference to stdout logger, then shadow with diagnostics logger + stdoutLogger := logger + logger := utils.NewTestLoggerWithOutput(utils.InfoLevel, output) + + // Log where diagnostics are being written (to main test output) + if filename != "" { + stdoutLogger.Infof("Writing diagnostics to file: %s", filename) + } + logger.Info("================================================================================") logger.Info("=== COLLECTING FAILURE DIAGNOSTICS ===") logger.Info("================================================================================") // Collect each type of diagnostic, continuing even if one fails - dumpOperatorLogs(tc) - dumpGroveResources(tc) - dumpPodDetails(tc) - dumpRecentEvents(tc) + dumpOperatorLogs(tc, logger) + dumpGroveResources(tc, logger) + dumpPodDetails(tc, logger) + dumpRecentEvents(tc, logger) logger.Info("================================================================================") logger.Info("=== END OF FAILURE DIAGNOSTICS ===") logger.Info("================================================================================") + + // Log completion message (to main test output) + if filename != "" { + stdoutLogger.Infof("Diagnostics collection complete. Output written to: %s", filename) + } } // dumpOperatorLogs captures and prints operator logs at INFO level. -// Captures the last operatorLogLines lines from all containers in the operator pod. -func dumpOperatorLogs(tc TestContext) { +// Captures all logs from all containers in the operator pod. +func dumpOperatorLogs(tc TestContext, logger *utils.Logger) { logger.Info("================================================================================") - logger.Infof("=== OPERATOR LOGS (last %d lines) ===", operatorLogLines) + logger.Info("=== OPERATOR LOGS (all) ===") logger.Info("================================================================================") // List pods in the operator namespace @@ -141,10 +308,8 @@ func dumpOperatorLogs(tc TestContext) { for _, container := range pod.Spec.Containers { logger.Infof("--- Container: %s Logs ---", container.Name) - tailLines := int64(operatorLogLines) req := tc.Clientset.CoreV1().Pods(setup.OperatorNamespace).GetLogs(pod.Name, &corev1.PodLogOptions{ Container: container.Name, - TailLines: &tailLines, }) logStream, err := req.Stream(tc.Ctx) @@ -181,7 +346,7 @@ func dumpOperatorLogs(tc TestContext) { } // dumpGroveResources dumps all Grove resources as YAML at INFO level. -func dumpGroveResources(tc TestContext) { +func dumpGroveResources(tc TestContext, logger *utils.Logger) { logger.Info("================================================================================") logger.Info("=== GROVE RESOURCES ===") logger.Info("================================================================================") @@ -227,7 +392,7 @@ func dumpGroveResources(tc TestContext) { // dumpPodDetails dumps detailed pod information at INFO level. // Lists ALL pods in the namespace (not filtered by workload label selector) // to ensure we capture all relevant pods during failure diagnostics. -func dumpPodDetails(tc TestContext) { +func dumpPodDetails(tc TestContext, logger *utils.Logger) { logger.Info("================================================================================") logger.Info("=== POD DETAILS ===") logger.Info("================================================================================") @@ -311,7 +476,7 @@ func dumpPodDetails(tc TestContext) { } // dumpRecentEvents dumps Kubernetes events from the last eventLookbackDuration at INFO level. -func dumpRecentEvents(tc TestContext) { +func dumpRecentEvents(tc TestContext, logger *utils.Logger) { logger.Info("================================================================================") logger.Infof("=== KUBERNETES EVENTS (last %v) ===", eventLookbackDuration) logger.Info("================================================================================")