9 changes: 5 additions & 4 deletions .github/workflows/e2e-test.yaml
@@ -81,12 +81,13 @@ jobs:
skaffold version

- name: Run e2e tests - ${{ matrix.test_name }}
env:
# Override diagnostics for CI: write to /tmp for artifact upload
GROVE_E2E_DIAG_MODE: "file"
GROVE_E2E_DIAG_DIR: "/tmp"
run: |
cd operator
echo "> Preparing charts (copying CRDs)..."
./hack/prepare-charts.sh
echo "> Running e2e tests for ${{ matrix.test_name }}..."
cd e2e && go test -tags=e2e ./tests/... -v -timeout 45m -run '${{ matrix.test_pattern }}'
make test-e2e TEST_PATTERN='${{ matrix.test_pattern }}'

# The test code handles cleanup via Teardown(), but this step provides
# extra safety in case of timeout or panic. Also good practice to ensure
15 changes: 11 additions & 4 deletions operator/Makefile
@@ -89,11 +89,18 @@ cover-html: test-cover
# Run e2e tests
# Usage: make test-e2e [TEST_PATTERN=<pattern>]
# Examples:
# make test-e2e # Run all tests
# make test-e2e TEST_PATTERN=Test_GS # Run all gang scheduling tests
# make test-e2e TEST_PATTERN=Test_GS1 # Run specific test
# make test-e2e TEST_PATTERN=Test_TAS # Run all topology tests
# make test-e2e # Run all tests
# make test-e2e TEST_PATTERN=Test_GS # Run gang scheduling tests
# make test-e2e TEST_PATTERN=Test_GS1 # Run specific test
# make test-e2e TEST_PATTERN=Test_TAS # Run topology tests
# GROVE_E2E_DIAG_MODE=both make test-e2e # Output to stdout and file
# GROVE_E2E_DIAG_MODE=stdout make test-e2e # Output to stdout only
#
# Environment variables:
# GROVE_E2E_DIAG_MODE - Diagnostic output mode: stdout, file, both (default: file)
# GROVE_E2E_DIAG_DIR - Directory for diagnostic files (default: operator/)
.PHONY: test-e2e
test-e2e: export GROVE_E2E_DIAG_DIR = $(MODULE_ROOT)
test-e2e:
@echo "> Preparing charts (copying CRDs)..."
@$(MODULE_HACK_DIR)/prepare-charts.sh
197 changes: 181 additions & 16 deletions operator/e2e/tests/debug_utils.go
@@ -19,12 +19,17 @@
package tests

import (
"errors"
"fmt"
"io"
"os"
"path/filepath"
"sort"
"strings"
"time"

"github.com/ai-dynamo/grove/operator/e2e/setup"
"github.com/ai-dynamo/grove/operator/e2e/utils"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
@@ -35,12 +40,21 @@ const (
// logBufferSize is the size of the buffer for reading logs from the operator (debugging purposes)
logBufferSize = 64 * 1024 // 64KB

// operatorLogLines is the number of log lines to capture from the operator.
// Set to 2000 to ensure we capture logs from before the failure occurred,
// not just the steady-state logs after the failure.
operatorLogLines = 2000
// eventLookbackDuration is how far back to look for events
eventLookbackDuration = 10 * time.Minute

// DiagnosticsModeEnvVar controls diagnostic output mode.
// Values: "stdout", "file", "both" (default: "file")
DiagnosticsModeEnvVar = "GROVE_E2E_DIAG_MODE"

// DiagnosticsDirEnvVar specifies the directory for diagnostic files.
// Used when mode is "file" or "both". Default: current directory (fallback to /tmp).
DiagnosticsDirEnvVar = "GROVE_E2E_DIAG_DIR"

// Diagnostics mode constants
DiagnosticsModeStdout = "stdout"
DiagnosticsModeFile = "file"
DiagnosticsModeBoth = "both"
)

// isPodReady checks if a pod is ready
@@ -68,30 +82,183 @@ var groveResourceTypes = []groveResourceType{
{"PodGangs", schema.GroupVersionResource{Group: "scheduler.grove.io", Version: "v1alpha1", Resource: "podgangs"}, "PODGANG"},
}

// nopCloser wraps an io.Writer to implement io.WriteCloser with a no-op Close.
// Used for os.Stdout, which we don't want to actually close.
type nopCloser struct {
io.Writer
}

func (nopCloser) Close() error { return nil }

// multiWriteCloser wraps multiple io.WriteCloser instances.
// Write writes to every writer; Close closes them all.
type multiWriteCloser struct {
writers []io.WriteCloser
}

func newMultiWriteCloser(writers ...io.WriteCloser) *multiWriteCloser {
return &multiWriteCloser{writers: writers}
}

func (m *multiWriteCloser) Write(p []byte) (n int, err error) {
for _, w := range m.writers {
n, err = w.Write(p)
if err != nil {
return n, err
}
}
return len(p), nil
}

func (m *multiWriteCloser) Close() error {
var errs []error
for _, w := range m.writers {
if err := w.Close(); err != nil {
errs = append(errs, err)
}
}
return errors.Join(errs...)
}
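
// Illustrative sketch, not part of the change: fan a single Write out to stdout
// and a file. Close closes the file, while the nopCloser leaves os.Stdout open.
// The file path and the exampleFanOut name are hypothetical.
func exampleFanOut() error {
    f, err := os.Create(filepath.Join(os.TempDir(), "e2e-diag-example.log"))
    if err != nil {
        return err
    }
    w := newMultiWriteCloser(nopCloser{os.Stdout}, f)
    defer w.Close()

    // One write lands in both destinations.
    _, err = fmt.Fprintln(w, "=== COLLECTING FAILURE DIAGNOSTICS ===")
    return err
}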

// createDiagnosticsOutput creates writer(s) for diagnostics output based on GROVE_E2E_DIAG_MODE.
// Mode values:
// - "stdout": write to stdout only
// - "file": write to timestamped file only (default)
// - "both": write to both stdout and file
//
// When writing to file, GROVE_E2E_DIAG_DIR specifies the directory (default: current dir, fallback to /tmp).
// Returns the writer, filename (empty if stdout-only), and any error.
func createDiagnosticsOutput(testName string) (io.WriteCloser, string, error) {
mode := os.Getenv(DiagnosticsModeEnvVar)
if mode == "" {
mode = DiagnosticsModeFile // default
}

var writers []io.WriteCloser
var filename string

// Add stdout if mode is stdout or both
if mode == DiagnosticsModeStdout || mode == DiagnosticsModeBoth {
writers = append(writers, nopCloser{os.Stdout})
}

// Add file if mode is file or both
if mode == DiagnosticsModeFile || mode == DiagnosticsModeBoth {
file, name, err := createDiagnosticsFile(testName)
if err != nil {
// If we can't create a file but stdout is available, continue with stdout only
if mode == DiagnosticsModeBoth {
logger.Infof("[DIAG] Warning: failed to create diagnostics file: %v (continuing with stdout only)", err)
} else {
return nil, "", err
}
} else {
filename = name
writers = append(writers, file)
}
}

if len(writers) == 0 {
return nil, "", fmt.Errorf("no valid diagnostics output configured (mode=%s)", mode)
}

return newMultiWriteCloser(writers...), filename, nil
}

// createDiagnosticsFile creates a timestamped diagnostics file
func createDiagnosticsFile(testName string) (*os.File, string, error) {
// Sanitize test name for use in filename (replace / with _)
sanitizedName := strings.ReplaceAll(testName, "/", "_")

// Create a timestamped file with test name
timestamp := time.Now().Format("2006-01-02_15-04-05")
baseFilename := fmt.Sprintf("e2e-diag-%s_%s.log", sanitizedName, timestamp)

// Determine the directory for the diagnostics file
diagDir := os.Getenv(DiagnosticsDirEnvVar)
if diagDir != "" {
filename := filepath.Join(diagDir, baseFilename)
file, err := os.Create(filename)
if err != nil {
return nil, "", fmt.Errorf("failed to create diagnostics file in %s: %w", diagDir, err)
}
return file, filename, nil
}

// Try to create the file in the current directory
file, err := os.Create(baseFilename)
if err != nil {
// Fall back to a temp directory if we can't write to current dir
filename := filepath.Join(os.TempDir(), baseFilename)
file, err = os.Create(filename)
if err != nil {
return nil, "", fmt.Errorf("failed to create diagnostics file: %w", err)
}
return file, filename, nil
}

return file, baseFilename, nil
}
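
// Illustrative sketch, not part of the change: reproduces the naming scheme above
// under assumed inputs (GROVE_E2E_DIAG_DIR=/tmp, a hypothetical subtest named
// "Test_GS1/gang_scheduling", and a fixed start time) to show the resulting path.
func exampleDiagnosticsPath() string {
    sanitized := strings.ReplaceAll("Test_GS1/gang_scheduling", "/", "_")
    timestamp := time.Date(2024, 5, 1, 13, 4, 5, 0, time.UTC).Format("2006-01-02_15-04-05")
    // => "/tmp/e2e-diag-Test_GS1_gang_scheduling_2024-05-01_13-04-05.log"
    return filepath.Join("/tmp", fmt.Sprintf("e2e-diag-%s_%s.log", sanitized, timestamp))
}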

// CollectAllDiagnostics collects and prints all diagnostic information at INFO level.
// This should be called when a test fails, before cleanup runs.
// All output is at INFO level to ensure visibility regardless of log level settings.
//
// Output is controlled by GROVE_E2E_DIAG_MODE env var:
// - "stdout": print to stdout only
// - "file": write to timestamped file only (default)
// - "both": write to both stdout and file
//
// Set GROVE_E2E_DIAG_DIR env var to specify the output directory for diagnostics files.
func CollectAllDiagnostics(tc TestContext) {
// Get test name for the diagnostics file
testName := "unknown_test"
if tc.T != nil {
testName = tc.T.Name()
}

// Create diagnostics output
output, filename, err := createDiagnosticsOutput(testName)
if err != nil {
logger.Errorf("Failed to create diagnostics output, falling back to stdout: %v", err)
output = nopCloser{os.Stdout}
}
defer output.Close()

// Save reference to stdout logger, then shadow with diagnostics logger
stdoutLogger := logger
logger := utils.NewTestLoggerWithOutput(utils.InfoLevel, output)

// Log where diagnostics are being written (to main test output)
if filename != "" {
stdoutLogger.Infof("Writing diagnostics to file: %s", filename)
}

logger.Info("================================================================================")
logger.Info("=== COLLECTING FAILURE DIAGNOSTICS ===")
logger.Info("================================================================================")

// Collect each type of diagnostic, continuing even if one fails
dumpOperatorLogs(tc)
dumpGroveResources(tc)
dumpPodDetails(tc)
dumpRecentEvents(tc)
dumpOperatorLogs(tc, logger)
dumpGroveResources(tc, logger)
dumpPodDetails(tc, logger)
dumpRecentEvents(tc, logger)

logger.Info("================================================================================")
logger.Info("=== END OF FAILURE DIAGNOSTICS ===")
logger.Info("================================================================================")

// Log completion message (to main test output)
if filename != "" {
stdoutLogger.Infof("Diagnostics collection complete. Output written to: %s", filename)
}
}
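
// Illustrative sketch, not part of the change: one way a test could collect
// diagnostics on failure before its own teardown. newTestContext is a hypothetical
// helper that builds TestContext (including tc.T), and the test file is assumed to
// import the standard "testing" package.
func Test_ExampleWithDiagnostics(t *testing.T) {
    tc := newTestContext(t)
    defer func() {
        if t.Failed() {
            CollectAllDiagnostics(tc)
        }
        // ... suite teardown would follow here ...
    }()

    // ... test body ...
}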

// dumpOperatorLogs captures and prints operator logs at INFO level.
// Captures the last operatorLogLines lines from all containers in the operator pod.
func dumpOperatorLogs(tc TestContext) {
// Captures all logs from all containers in the operator pod.
func dumpOperatorLogs(tc TestContext, logger *utils.Logger) {
logger.Info("================================================================================")
logger.Infof("=== OPERATOR LOGS (last %d lines) ===", operatorLogLines)
logger.Info("=== OPERATOR LOGS (all) ===")
logger.Info("================================================================================")

// List pods in the operator namespace
@@ -141,10 +308,8 @@ func dumpOperatorLogs(tc TestContext) {
for _, container := range pod.Spec.Containers {
logger.Infof("--- Container: %s Logs ---", container.Name)

tailLines := int64(operatorLogLines)
req := tc.Clientset.CoreV1().Pods(setup.OperatorNamespace).GetLogs(pod.Name, &corev1.PodLogOptions{
Container: container.Name,
TailLines: &tailLines,
})

logStream, err := req.Stream(tc.Ctx)
@@ -181,7 +346,7 @@ func dumpOperatorLogs(tc TestContext) {
}

// dumpGroveResources dumps all Grove resources as YAML at INFO level.
func dumpGroveResources(tc TestContext) {
func dumpGroveResources(tc TestContext, logger *utils.Logger) {
logger.Info("================================================================================")
logger.Info("=== GROVE RESOURCES ===")
logger.Info("================================================================================")
@@ -227,7 +392,7 @@ func dumpGroveResources(tc TestContext) {
// dumpPodDetails dumps detailed pod information at INFO level.
// Lists ALL pods in the namespace (not filtered by workload label selector)
// to ensure we capture all relevant pods during failure diagnostics.
func dumpPodDetails(tc TestContext) {
func dumpPodDetails(tc TestContext, logger *utils.Logger) {
logger.Info("================================================================================")
logger.Info("=== POD DETAILS ===")
logger.Info("================================================================================")
@@ -311,7 +476,7 @@ func dumpPodDetails(tc TestContext) {
}

// dumpRecentEvents dumps Kubernetes events from the last eventLookbackDuration at INFO level.
func dumpRecentEvents(tc TestContext) {
func dumpRecentEvents(tc TestContext, logger *utils.Logger) {
logger.Info("================================================================================")
logger.Infof("=== KUBERNETES EVENTS (last %v) ===", eventLookbackDuration)
logger.Info("================================================================================")