Add MIG devices support

zamog · zamog · commit 459ebd57731b · 2021-12-15T23:14:34.000+02:00
Signed-off-by: Gilad Zamoscinski <gilad.z@taboola.com>
diff --git a/main.go b/main.go
@@ -29,11 +29,17 @@ import (
 	"volcano.sh/k8s-device-plugin/pkg/filewatcher"
 	"volcano.sh/k8s-device-plugin/pkg/plugin"
 	"volcano.sh/k8s-device-plugin/pkg/plugin/nvidia"
+	"github.com/NVIDIA/go-gpuallocator/gpuallocator"
 )
 
 func getAllPlugins() []plugin.DevicePlugin {
 	return []plugin.DevicePlugin{
-		nvidia.NewNvidiaDevicePlugin(),
+		nvidia.NewNvidiaDevicePlugin(
+			nvidia.VolcanoGPUResource,
+			nvidia.NewGpuDeviceManager(false),
+			nvidia.VisibleDevice,
+			gpuallocator.Policy(nil),
+			pluginapi.DevicePluginPath + "volcano.sock"),
 	}
 }
 
diff --git a/pkg/plugin/nvidia/mig-strategy.go b/pkg/plugin/nvidia/mig-strategy.go
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-package main
+package nvidia
 
 import (
 	"fmt"
diff --git a/pkg/plugin/nvidia/server.go b/pkg/plugin/nvidia/server.go
@@ -27,6 +27,7 @@ import (
 	"time"
 
 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
+	"github.com/NVIDIA/go-gpuallocator/gpuallocator"
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/klog"
 
@@ -41,10 +42,12 @@ type NvidiaDevicePlugin struct {
 	socket       string
 
 	server *grpc.Server
+	deviceListEnvvar string
+	allocatePolicy   gpuallocator.Policy
 	// Physical gpu card
 	physicalDevices []*Device
 	health          chan *Device
-	stop            chan struct{}
+	stop            chan interface{}
 
 	// Virtual devices
 	virtualDevices []*pluginapi.Device
@@ -54,7 +57,7 @@ type NvidiaDevicePlugin struct {
 }
 
 // NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
-func NewNvidiaDevicePlugin() *NvidiaDevicePlugin {
+func NewNvidiaDevicePlugin(resourceName string, resourceManager ResourceManager, deviceListEnvvar string, allocatePolicy gpuallocator.Policy, socket string) *NvidiaDevicePlugin {
 	log.Println("Loading NVML")
 	if err := nvml.Init(); err != nil {
 		log.Printf("Failed to initialize NVML: %s.", err)
@@ -69,9 +72,11 @@ func NewNvidiaDevicePlugin() *NvidiaDevicePlugin {
 	}
 
 	return &NvidiaDevicePlugin{
-		ResourceManager: NewGpuDeviceManager(),
-		resourceName:    VolcanoGPUResource,
-		socket:          pluginapi.DevicePluginPath + "volcano.sock",
+		ResourceManager:  resourceManager,
+		deviceListEnvvar: deviceListEnvvar,
+		resourceName:     resourceName,
+		socket:           socket,
+		allocatePolicy:   allocatePolicy,
 		kubeInteractor:  ki,
 
 		// These will be reinitialized every
@@ -89,7 +94,7 @@ func (m *NvidiaDevicePlugin) initialize() {
 	m.physicalDevices = m.Devices()
 	m.server = grpc.NewServer([]grpc.ServerOption{}...)
 	m.health = make(chan *Device)
-	m.stop = make(chan struct{})
+	m.stop = make(chan interface{})
 
 	m.virtualDevices, m.devicesByIndex = GetDevices()
 }
@@ -122,7 +127,7 @@ func (m *NvidiaDevicePlugin) Name() string {
 func (m *NvidiaDevicePlugin) Start() error {
 	m.initialize()
 	// must be called after initialize
-	if err := m.kubeInteractor.PatchGPUResourceOnNode(len(m.physicalDevices)); err != nil {
+	if err := m.kubeInteractor.PatchGPUResourceOnNode(len(m.devicesByIndex)); err != nil {
 		log.Printf("failed to patch gpu resource: %v", err)
 		m.cleanup()
 		return fmt.Errorf("failed to patch gpu resource: %v", err)
@@ -314,7 +319,7 @@ Allocate:
 		klog.Warningf("Failed to get the gpu id for pod %s/%s", candidatePod.Namespace, candidatePod.Name)
 		return nil, fmt.Errorf("failed to find gpu id")
 	}
-	_, exist := m.GetDeviceNameByIndex(uint(id))
+	deviceName, exist := m.GetDeviceNameByIndex(uint(id))
 	if !exist {
 		klog.Warningf("Failed to find the dev for pod %s/%s because it's not able to find dev with index %d",
 			candidatePod.Namespace, candidatePod.Name, id)
@@ -325,7 +330,7 @@ Allocate:
 		reqGPU := len(req.DevicesIDs)
 		response := pluginapi.ContainerAllocateResponse{
 			Envs: map[string]string{
-				VisibleDevice:        fmt.Sprintf("%d", id),
+				VisibleDevice:        fmt.Sprintf("%s", deviceName),
 				AllocatedGPUResource: fmt.Sprintf("%d", reqGPU),
 				TotalGPUResource:     fmt.Sprintf("%d", gpuMemory),
 			},
@@ -362,3 +367,33 @@ func (m *NvidiaDevicePlugin) dial(unixSocketPath string, timeout time.Duration)
 
 	return c, nil
 }
+
+// GetPreferredAllocation returns the preferred allocation from the set of devices specified in the request
+func (m *NvidiaDevicePlugin) GetPreferredAllocation(ctx context.Context, r *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) {
+	response := &pluginapi.PreferredAllocationResponse{}
+	for _, req := range r.ContainerRequests {
+		available, err := gpuallocator.NewDevicesFrom(req.AvailableDeviceIDs)
+		if err != nil {
+			return nil, fmt.Errorf("Unable to retrieve list of available devices: %v", err)
+		}
+
+		required, err := gpuallocator.NewDevicesFrom(req.MustIncludeDeviceIDs)
+		if err != nil {
+			return nil, fmt.Errorf("Unable to retrieve list of required devices: %v", err)
+		}
+
+		allocated := m.allocatePolicy.Allocate(available, required, int(req.AllocationSize))
+
+		var deviceIds []string
+		for _, device := range allocated {
+			deviceIds = append(deviceIds, device.UUID)
+		}
+
+		resp := &pluginapi.ContainerPreferredAllocationResponse{
+			DeviceIDs: deviceIds,
+		}
+
+		response.ContainerResponses = append(response.ContainerResponses, resp)
+	}
+	return response, nil
+}
diff --git a/pkg/plugin/nvidia/utils.go b/pkg/plugin/nvidia/utils.go
@@ -63,6 +63,8 @@ func GenerateVirtualDeviceID(id uint, fakeCounter uint) string {
 
 func SetGPUMemory(raw uint) {
 	v := raw
+	// TODO Add cli flag for units
+	v = uint(math.Floor(float64(raw) / 100.0))
 	gpuMemory = v
 	log.Infof("set gpu memory: %d", gpuMemory)
 }
@@ -81,23 +83,49 @@ func GetDevices() ([]*pluginapi.Device, map[uint]string) {
 	for i := uint(0); i < n; i++ {
 		d, err := nvml.NewDevice(i)
 		check(err)
-		var id uint
-		_, err = fmt.Sscanf(d.Path, "/dev/nvidia%d", &id)
+		migEnabled, err := d.IsMigEnabled()
 		check(err)
-		deviceByIndex[id] = d.UUID
-		// TODO: Do we assume all cards are of same capacity
-		if GetGPUMemory() == uint(0) {
-			SetGPUMemory(uint(*d.Memory))
-		}
-		for j := uint(0); j < GetGPUMemory(); j++ {
-			fakeID := GenerateVirtualDeviceID(id, j)
-			virtualDevs = append(virtualDevs, &pluginapi.Device{
-				ID:     fakeID,
-				Health: pluginapi.Healthy,
-			})
+
+		var id uint
+		// TODO: Support only MigStrategySingle
+		if migEnabled {
+			migs, err := d.GetMigDevices()
+			check(err)
+			for j, mig := range migs {
+				// TODO: explain formula (based on device and mig numbers)
+				id =  i*uint(2) + i + uint(j)
+				deviceByIndex[id] = mig.UUID
+				if GetGPUMemory() == uint(0) {
+					SetGPUMemory(uint(*mig.Memory))
+				}
+				for j := uint(0); j < GetGPUMemory(); j++ {
+					fakeID := GenerateVirtualDeviceID(id, j)
+					virtualDevs = append(virtualDevs, &pluginapi.Device{
+						ID:     fakeID,
+						Health: pluginapi.Healthy,
+					})
+				}
+
+			}
+
+		} else {
+
+			_, err = fmt.Sscanf(d.Path, "/dev/nvidia%d", &id)
+			check(err)
+			deviceByIndex[id] = d.UUID
+			// TODO: Do we assume all cards are of same capacity
+			if GetGPUMemory() == uint(0) {
+				SetGPUMemory(uint(*d.Memory))
+			}
+			for j := uint(0); j < GetGPUMemory(); j++ {
+				fakeID := GenerateVirtualDeviceID(id, j)
+				virtualDevs = append(virtualDevs, &pluginapi.Device{
+					ID:     fakeID,
+					Health: pluginapi.Healthy,
+				})
+			}
 		}
 	}
-
 	return virtualDevs, deviceByIndex
 }
 

Original file line number	Diff line number	Diff line change
`@@ -29,11 +29,17 @@ import (`
`29`	`29`	`"volcano.sh/k8s-device-plugin/pkg/filewatcher"`
`30`	`30`	`"volcano.sh/k8s-device-plugin/pkg/plugin"`
`31`	`31`	`"volcano.sh/k8s-device-plugin/pkg/plugin/nvidia"`
	`32`	`+ "github.com/NVIDIA/go-gpuallocator/gpuallocator"`
`32`	`33`	`)`
`33`	`34`
`34`	`35`	`func getAllPlugins() []plugin.DevicePlugin {`
`35`	`36`	`return []plugin.DevicePlugin{`
`36`		`- nvidia.NewNvidiaDevicePlugin(),`
	`37`	`+ nvidia.NewNvidiaDevicePlugin(`
	`38`	`+ nvidia.VolcanoGPUResource,`
	`39`	`+ nvidia.NewGpuDeviceManager(false),`
	`40`	`+ nvidia.VisibleDevice,`
	`41`	`+ gpuallocator.Policy(nil),`
	`42`	`+ pluginapi.DevicePluginPath + "volcano.sock"),`
`37`	`43`	`}`
`38`	`44`	`}`
`39`	`45`