diff --git a/pkg/worker/gpu_info_test.go b/pkg/worker/gpu_info_test.go index e79a308b6..eba83116f 100644 --- a/pkg/worker/gpu_info_test.go +++ b/pkg/worker/gpu_info_test.go @@ -2,7 +2,6 @@ package worker import ( "errors" - "os" "testing" "github.com/stretchr/testify/assert" @@ -175,33 +174,6 @@ func TestAvailableGPUDevicesReturnsQueryErrors(t *testing.T) { assert.Nil(t, devices) } -func TestResolveVisibleDevicesUsesChildProcess(t *testing.T) { - origResolve := resolveVisibleDevices - defer func() { resolveVisibleDevices = origResolve }() - - resolveVisibleDevices = func() string { - return "GPU-04612b44-abcd-1234-5678-aabbccddeeff" - } - - result := resolveVisibleDevices() - assert.Equal(t, "GPU-04612b44-abcd-1234-5678-aabbccddeeff", result) -} - -func TestResolveVisibleDevicesFallsBackToEnv(t *testing.T) { - origResolve := resolveVisibleDevices - defer func() { resolveVisibleDevices = origResolve }() - - os.Setenv("NVIDIA_VISIBLE_DEVICES", "GPU-fallback-uuid") - defer os.Unsetenv("NVIDIA_VISIBLE_DEVICES") - - resolveVisibleDevices = func() string { - return os.Getenv("NVIDIA_VISIBLE_DEVICES") - } - - result := resolveVisibleDevices() - assert.Equal(t, "GPU-fallback-uuid", result) -} - func TestAvailableGPUDevicesSingleGPUUUID(t *testing.T) { cleanup := withMockDevices(eightGPUOutput, true) defer cleanup() diff --git a/pkg/worker/gpu_integration_test.go b/pkg/worker/gpu_integration_test.go new file mode 100644 index 000000000..55d9e9564 --- /dev/null +++ b/pkg/worker/gpu_integration_test.go @@ -0,0 +1,102 @@ +package worker + +import ( + "fmt" + "os" + "strings" + "testing" + + common "github.com/beam-cloud/beta9/pkg/common" + "gvisor.dev/gvisor/pkg/sync" +) + +func TestIntegrationGPUIsolation(t *testing.T) { + if os.Getenv("GPU_INTEGRATION") != "1" { + t.Skip("set GPU_INTEGRATION=1 to run on a real GPU node") + } + + // Step 1: Read PID 1's env (what os.Getenv sees — the broken path) + pid1Env := "(unknown)" + data, err := os.ReadFile("/proc/1/environ") + if err == nil { + for _, entry := range strings.Split(string(data), "\x00") { + if strings.HasPrefix(entry, "NVIDIA_VISIBLE_DEVICES=") { + pid1Env = strings.TrimPrefix(entry, "NVIDIA_VISIBLE_DEVICES=") + break + } + } + } + t.Logf("PID 1 NVIDIA_VISIBLE_DEVICES = %q", pid1Env) + + // Step 2: Call the REAL resolveVisibleDevices() from gpu_info.go + resolved := resolveVisibleDevices() + t.Logf("resolveVisibleDevices() = %q", resolved) + + if resolved == "void" || resolved == "" { + t.Fatalf("resolveVisibleDevices() returned %q — void bug NOT fixed", resolved) + } + if !strings.HasPrefix(resolved, "GPU-") { + t.Fatalf("resolveVisibleDevices() returned %q — expected GPU UUID", resolved) + } + + // Step 3: Create the REAL NvidiaInfoClient with the resolved value (same as NewContainerNvidiaManager) + client := &NvidiaInfoClient{visibleDevices: resolved} + + // Step 4: Call the REAL AvailableGPUDevices() + devices, err := client.AvailableGPUDevices() + if err != nil { + t.Fatalf("AvailableGPUDevices() error: %v", err) + } + t.Logf("AvailableGPUDevices() = %v", devices) + + if len(devices) == 0 { + t.Fatal("AvailableGPUDevices() returned empty — GPU not visible") + } + if len(devices) != 1 { + t.Fatalf("AvailableGPUDevices() returned %d GPUs, expected exactly 1 for per-worker isolation", len(devices)) + } + + // Step 5: Verify the OLD path (void) would have failed + oldClient := &NvidiaInfoClient{visibleDevices: pid1Env} + oldDevices, err := oldClient.AvailableGPUDevices() + if err != nil { + t.Logf("Old path error (expected): %v", err) + } + t.Logf("Old path (PID 1 env=%q) -> AvailableGPUDevices() = %v", pid1Env, oldDevices) + + if pid1Env == "void" && len(oldDevices) > 0 { + t.Error("Old code path with void should return empty, but got devices — test logic wrong") + } + + // Step 6: Exercise the REAL ContainerNvidiaManager.AssignGPUDevices (chooseDevices) + manager := &ContainerNvidiaManager{ + gpuAllocationMap: common.NewSafeMap[[]int](), + gpuCount: 1, + mu: sync.Mutex{}, + infoClient: client, + resolvedVisibleDevices: resolved, + } + + assigned, err := manager.AssignGPUDevices("test-container-1", 1) + if err != nil { + t.Fatalf("AssignGPUDevices() failed: %v", err) + } + t.Logf("AssignGPUDevices(\"test-container-1\", 1) = %v", assigned) + + if len(assigned) != 1 { + t.Fatalf("Expected 1 assigned GPU, got %d", len(assigned)) + } + if assigned[0] != devices[0] { + t.Fatalf("Assigned GPU %d doesn't match available GPU %d", assigned[0], devices[0]) + } + + // Step 7: Verify second allocation to same worker FAILS (only 1 GPU available) + _, err = manager.AssignGPUDevices("test-container-2", 1) + if err == nil { + t.Fatal("Second allocation should fail — only 1 GPU per worker") + } + t.Logf("Second allocation correctly failed: %v", err) + + fmt.Printf("\nRESULT: resolved=%s gpu_index=%d old_path_would_fail=%v PASS\n", + resolved, assigned[0], pid1Env == "void" && len(oldDevices) == 0) +} diff --git a/pkg/worker/nvidia.go b/pkg/worker/nvidia.go index 44adaff76..729d249c0 100644 --- a/pkg/worker/nvidia.go +++ b/pkg/worker/nvidia.go @@ -32,11 +32,12 @@ type GPUManager interface { } type ContainerNvidiaManager struct { - gpuAllocationMap *common.SafeMap[[]int] - gpuCount uint32 - mu sync.Mutex - statFunc func(path string, stat *syscall.Stat_t) (err error) - infoClient GPUInfoClient + gpuAllocationMap *common.SafeMap[[]int] + gpuCount uint32 + mu sync.Mutex + statFunc func(path string, stat *syscall.Stat_t) (err error) + infoClient GPUInfoClient + resolvedVisibleDevices string } func NewContainerNvidiaManager(gpuCount uint32) GPUManager { @@ -51,11 +52,12 @@ func NewContainerNvidiaManager(gpuCount uint32) GPUManager { log.Info().Str("resolved_visible_devices", visibleDevices).Msg("resolved NVIDIA_VISIBLE_DEVICES for GPU filtering") return &ContainerNvidiaManager{ - gpuAllocationMap: common.NewSafeMap[[]int](), - gpuCount: gpuCount, - mu: sync.Mutex{}, - statFunc: syscall.Stat, - infoClient: &NvidiaInfoClient{visibleDevices: visibleDevices}, + gpuAllocationMap: common.NewSafeMap[[]int](), + gpuCount: gpuCount, + mu: sync.Mutex{}, + statFunc: syscall.Stat, + infoClient: &NvidiaInfoClient{visibleDevices: visibleDevices}, + resolvedVisibleDevices: visibleDevices, } } @@ -114,7 +116,7 @@ func (c *ContainerNvidiaManager) chooseDevices(containerId string, requestedGpuC // Check if we managed to allocate the requested number of GPUs if len(allocableDevices) < int(requestedGpuCount) { return nil, fmt.Errorf("not enough GPUs available: requested=%d, allocable=%d, visible=%d, configured=%d, already_allocated=%d, NVIDIA_VISIBLE_DEVICES=%q", - requestedGpuCount, len(allocableDevices), len(availableDevices), c.gpuCount, len(currentAllocations), c.infoClient.(*NvidiaInfoClient).visibleDevices) + requestedGpuCount, len(allocableDevices), len(availableDevices), c.gpuCount, len(currentAllocations), c.resolvedVisibleDevices) } // Allocate the requested number of GPUs