Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/27177.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
scheduler (Enterprise): Fixed a bug where tasks were not placed on same numa node as reserved device
```
1 change: 1 addition & 0 deletions command/agent/job_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -1639,6 +1639,7 @@ func ApiResourcesToStructs(in *api.Resources) *structs.Resources {
if in.NUMA != nil {
out.NUMA = &structs.NUMA{
Affinity: in.NUMA.Affinity,
Devices: slices.Clone(in.NUMA.Devices),
}
}

Expand Down
37 changes: 16 additions & 21 deletions scheduler/feasible/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,25 +120,32 @@ func (d *deviceAllocator) createOffer(mem *memoryNodeMatcher, ask *structs.Reque
// Determine the devices that are feasible based on availability and
// constraints
for id, devInst := range d.Devices {
// Check if the device works
if !nodeDeviceMatches(d.ctx, devInst.Device, ask) {
continue
}

// Check if we have enough unused instances to use this
assignable := uint64(0)
assignable := []string{}
for instanceID, v := range devInst.Instances {
if v != 0 {
continue
}
if !mem.Matches(instanceID, devInst.Device) {
continue
}
assignable++
}
if d.deviceIDMatchesConstraint(instanceID, ask.Constraints, devInst.Device) {
assignable = append(assignable, instanceID)
}

// This device doesn't have enough instances
if assignable < ask.Count {
continue
// Don't assign more than the ask
if len(assignable) == int(ask.Count) {
break
}
}

// Check if the device works
if !nodeDeviceMatches(d.ctx, devInst.Device, ask) {
// This device doesn't have enough instances
if len(assignable) < int(ask.Count) {
continue
}

Expand Down Expand Up @@ -185,19 +192,7 @@ func (d *deviceAllocator) createOffer(mem *memoryNodeMatcher, ask *structs.Reque
Vendor: id.Vendor,
Type: id.Type,
Name: id.Name,
DeviceIDs: make([]string, 0, ask.Count),
}

assigned := uint64(0)
for id, v := range devInst.Instances {
if v == 0 && assigned < ask.Count &&
d.deviceIDMatchesConstraint(id, ask.Constraints, devInst.Device) {
assigned++
offer.DeviceIDs = append(offer.DeviceIDs, id)
if assigned == ask.Count {
break
}
}
DeviceIDs: assignable,
}
}

Expand Down
52 changes: 50 additions & 2 deletions scheduler/feasible/device_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/hashicorp/go-set/v3"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/client/lib/numalib"
"github.com/hashicorp/nomad/client/lib/numalib/hw"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
Expand Down Expand Up @@ -215,7 +216,7 @@ func TestDeviceAllocator_Allocate_NUMA_node1(t *testing.T) {
}

// Test that asking for a device with constraints works
func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
func TestDeviceAllocate_Constraints_NoMemoryMatch(t *testing.T) {
ci.Parallel(t)

n := multipleNvidiaNode()
Expand Down Expand Up @@ -349,8 +350,55 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
}
}

func TestDeviceAllocate_Constraints_MemoryMatch(t *testing.T) {
ci.Parallel(t)

n := multipleNvidiaNode()
nvidia0 := n.NodeResources.Devices[0]

_, ctx := MockContext(t)
d := newDeviceAllocator(ctx, n)
must.NotNil(t, d)

testContraints := []*structs.Constraint{
{
LTarget: "${device.ids}",
Operand: "set_contains",
RTarget: nvidia0.Instances[1].ID,
},
}
// Build the request
ask := deviceRequest("nvidia/gpu", 1, testContraints, nil)

mem := &memoryNodeMatcher{
memoryNode: 1,
topology: &numalib.Topology{
BusAssociativity: map[string]hw.NodeID{
nvidia0.Instances[0].Locality.PciBusID: 1,
nvidia0.Instances[1].Locality.PciBusID: 2,
},
},
devices: set.From([]string{nvidia0.ID().String()}),
}
out, _, err := d.createOffer(mem, ask)

// the first memoryNodeMatcher does not have the correct memoryNode
must.ErrorContains(t, err, "no devices match")
must.Nil(t, out)

// change to the correct node
mem.memoryNode = 2
out, _, err = d.createOffer(mem, ask)

must.NoError(t, err)
must.Len(t, 1, out.DeviceIDs)
must.SliceContains(t, collectInstanceIDs(nvidia0), out.DeviceIDs[0])
must.SliceContains(t, out.DeviceIDs, nvidia0.Instances[1].ID)

}

// Test that asking for a device with affinities works
func TestDeviceAllocator_Allocate_Affinities(t *testing.T) {
func TestDeviceAllocator_Affinities(t *testing.T) {
ci.Parallel(t)

n := multipleNvidiaNode()
Expand Down