Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion cmd/gpu-kubelet-plugin/sharing.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,26 @@ func NewTimeSlicingManager(deviceLib *deviceLib) *TimeSlicingManager {
}
}

func (t *TimeSlicingManager) SetTimeSlice(devices UUIDProvider, config *configapi.TimeSlicingConfig) error {
func (t *TimeSlicingManager) SetTimeSlice(devices *UUIDProvider, config *configapi.TimeSlicingConfig) error {
// Ensure all devices are full devices
if !slices.Equal(devices.UUIDs(), devices.GpuUUIDs()) {
return fmt.Errorf("can only set the time-slice interval on full GPUs")
}
for _, gpu := range devices.Gpu.Devices {
err, isSupportTimeSlice := detectSupportTimeSliceByCudaComputeCapability(gpu.cudaComputeCapability)
if err != nil {
return fmt.Errorf("failed to detectSupportTimeSliceByCudaComputeCapability : %w", err)
}
if !isSupportTimeSlice {
klog.InfoS("the current card does not support setting time slices and will be ignored.", "arch", gpu.architecture, "uuid", gpu.uuid, "cudaComputeCapability", gpu.cudaComputeCapability)
return fmt.Errorf("setting a TimeSlice duration on devices uuid=%v is unsupported", gpu.uuid)
}
}

timeSlice := sharing.DefaultTimeSlice
if config != nil && config.TimeSlice != nil {
timeSlice = *config.TimeSlice
}

// Set the compute mode of the GPU to DEFAULT.
err := t.nvdevlib.setComputeMode(devices.UUIDs(), "DEFAULT")
Expand Down Expand Up @@ -440,3 +455,21 @@ func getDefaultShmSize() string {
}
return fallbackSize
}

// detectSupportTimeSliceByCudaComputeCapability reports whether a GPU supports
// setting a time-slice interval, judged by its CUDA compute capability.
//
// cudaComputeCapability is expected in "<major>.<minor>" form; only the part
// before the first "." is inspected. Time-slicing is believed to be available
// on Volta and newer architectures, so the check is simply major >= 7.
//
// It returns a non-nil error (wrapping the strconv parse error) together with
// false when the major version is not a valid integer. Otherwise the error is
// nil and the bool carries the result.
//
// NOTE: the (error, bool) result order is unconventional for Go, but it is
// kept as-is because callers unpack the results in this order.
func detectSupportTimeSliceByCudaComputeCapability(cudaComputeCapability string) (error, bool) {
	// ref https://github.com/NVIDIA/k8s-dra-driver/pull/58#discussion_r1469338562
	// By https://github.com/NVIDIA/go-nvlib/blob/main/pkg/nvlib/device/device.go#L149
	// the CUDA major and minor versions are concatenated with ".".

	// Cut yields the whole string when no "." is present, so a bare major
	// version such as "7" is parsed the same way as "7.0".
	majorPart, _, _ := strings.Cut(cudaComputeCapability, ".")
	major, err := strconv.Atoi(majorPart)
	if err != nil {
		// Wrap the parse error so callers can still inspect the root cause.
		return fmt.Errorf("error to get cudaComputeCapability major version %v: %w", cudaComputeCapability, err), false
	}
	return nil, major >= 7
}