Skip to content

Commit 3551599

Browse files
Merge pull request #167 from almaslennikov/spc-x-fixes
fix: different fixes for SPC-X support
2 parents 67d5373 + ff5a709 commit 3551599

File tree

13 files changed

+91
-56
lines changed

13 files changed

+91
-56
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ FROM nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
4646
ARG TARGETARCH
4747
ENV MFT_VERSION=4.29.0-131
4848

49-
ARG PACKAGES="dpkg-dev=1.21.1ubuntu2.3 mstflint=4.21.0+1-1ubuntu0.1~22.04.1"
49+
ARG PACKAGES="dpkg-dev=1.21.1ubuntu2.6 mstflint=4.21.0+1-1ubuntu0.1~22.04.1"
5050

5151
# enable deb-src repos
5252
RUN sed -i 's/^# deb-src/deb-src/g' /etc/apt/sources.list /etc/apt/sources.list.d/*

Dockerfile.nic-configuration-daemon

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ FROM nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
2929
ARG TARGETARCH
3030
ENV MFT_VERSION=4.32.0-120
3131

32-
ARG PACKAGES="dpkg-dev=1.21.1ubuntu2.3 libusb-1.0-0=2:1.0.25-1ubuntu2 ipmitool=1.8.18-11ubuntu2.2 rshim curl=7.81.0-1ubuntu1.20 systemd-sysv=249.11-0ubuntu3.16 mstflint=4.21.0+1-1ubuntu0.1~22.04.1"
32+
ARG PACKAGES="dpkg-dev=1.21.1ubuntu2.6 libusb-1.0-0=2:1.0.25-1ubuntu2 ipmitool=1.8.18-11ubuntu2.2 rshim curl=7.81.0-1ubuntu1.21 systemd-sysv=249.11-0ubuntu3.16 mstflint=4.21.0+1-1ubuntu0.1~22.04.1"
3333

3434
# enable deb-src repos
3535
RUN sed -i 's/^# deb-src/deb-src/g' /etc/apt/sources.list /etc/apt/sources.list.d/*

bindata/spectrum-x/RA2.0.yaml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ runtimeConfig:
5353
value: dscp
5454
dmsPath: /interfaces/interface/nvidia/qos/config/trust-mode
5555
valueType: string
56+
alternativeValue: QOS_TRUST_MODE_DSCP
5657
- name: PFC
5758
value: "00010000"
5859
dmsPath: /interfaces/interface/nvidia/qos/config/pfc
@@ -64,23 +65,23 @@ runtimeConfig:
6465
adaptiveRouting:
6566
- name: Adaptive Retransmission
6667
value: true
67-
dmsPath: /nvidia/roce/config/adaptive-retransmission
68+
dmsPath: /interfaces/interface/nvidia/roce/config/adaptive-retransmission
6869
valueType: bool
6970
- name: Tx Window
7071
value: true
71-
dmsPath: /nvidia/roce/config/tx-window
72+
dmsPath: /interfaces/interface/nvidia/roce/config/tx-window
7273
valueType: bool
7374
- name: Slow Restart
7475
value: false
75-
dmsPath: /nvidia/roce/config/slow-restart
76+
dmsPath: /interfaces/interface/nvidia/roce/config/slow-restart
7677
valueType: bool
7778
- name: Slow Restart Idle
7879
value: false
79-
dmsPath: /nvidia/roce/config/slow-restart-idle
80+
dmsPath: /interfaces/interface/nvidia/roce/config/slow-restart-idle
8081
valueType: bool
8182
- name: Adaptive Routing Force
8283
value: true
83-
dmsPath: /nvidia/roce/config/adaptive-routing-force
84+
dmsPath: /interfaces/interface/nvidia/roce/config/adaptive-routing-force
8485
valueType: bool
8586
congestionControl:
8687
- name: Congestion Control on RP points

cmd/nic-configuration-daemon/main.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@ import (
2121
"fmt"
2222
"maps"
2323
"os"
24+
"path/filepath"
2425
"slices"
26+
"strings"
2527

2628
maintenanceoperator "github.com/Mellanox/maintenance-operator/api/v1alpha1"
2729
"k8s.io/apimachinery/pkg/runtime"
@@ -156,6 +158,7 @@ func main() {
156158
MaintenanceManager: maintenanceManager,
157159
FirmwareManager: firmwareManager,
158160
EventRecorder: eventRecorder,
161+
SpectrumXManager: spectrumXConfigManager,
159162
HostUtils: hostUtils,
160163
}
161164
err = nicDeviceReconciler.SetupWithManager(mgr, true)
@@ -190,8 +193,9 @@ func initNicFwMap(namespace string) error {
190193
}
191194

192195
func initSpectrumXConfigs() (map[string]*types.SpectrumXConfig, error) {
196+
log.Log.V(2).Info("initSpectrumXConfigs(): reading spectrum-x configs")
193197
spectrumXConfigs := make(map[string]*types.SpectrumXConfig)
194-
entries, err := os.ReadDir("bindata/spectrum-x")
198+
entries, err := os.ReadDir("/bindata/spectrum-x")
195199
if err != nil {
196200
return nil, fmt.Errorf("failed to read spectrum-x directory: %w", err)
197201
}
@@ -200,11 +204,15 @@ func initSpectrumXConfigs() (map[string]*types.SpectrumXConfig, error) {
200204
continue
201205
}
202206

203-
config, err := types.LoadSpectrumXConfig("bindata/spectrum-x/" + file.Name())
207+
log.Log.V(2).Info("initSpectrumXConfigs(): loading spectrum-x config", "file", file.Name())
208+
config, err := types.LoadSpectrumXConfig("/bindata/spectrum-x/" + file.Name())
204209
if err != nil {
205210
return nil, fmt.Errorf("failed to load spectrum-x config: %w", err)
206211
}
207-
spectrumXConfigs[file.Name()] = config
212+
213+
configName := strings.TrimSuffix(file.Name(), filepath.Ext(file.Name()))
214+
spectrumXConfigs[configName] = config
215+
log.Log.V(2).Info("initSpectrumXConfigs(): added spectrum-x config", "configName", configName)
208216
}
209217

210218
return spectrumXConfigs, nil

pkg/configuration/configvalidation.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ func (v *configValidationImpl) RuntimeConfigApplied(device *v1alpha1.NicDevice)
288288
}
289289

290290
// Don't validate QoS settings if no QoS spec is present or none of trust, pfc, or ToS changes are requested
291-
if desiredQoSSpec != nil && desiredQoSSpec.Trust == "" && desiredQoSSpec.PFC == "" && desiredQoSSpec.ToS == 0 {
291+
if desiredQoSSpec == nil || (desiredQoSSpec.Trust == "" && desiredQoSSpec.PFC == "" && desiredQoSSpec.ToS == 0) {
292292
return true, nil
293293
}
294294

@@ -338,7 +338,7 @@ func (v *configValidationImpl) CalculateDesiredRuntimeConfig(device *v1alpha1.Ni
338338
if template.RoceOptimized != nil && template.RoceOptimized.Enabled {
339339
trust := "dscp"
340340
pfc := "0,0,0,1,0,0,0,0"
341-
tos := 96
341+
tos := 0
342342

343343
if template.RoceOptimized.Qos != nil {
344344
trust = template.RoceOptimized.Qos.Trust

pkg/configuration/configvalidation_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -704,7 +704,7 @@ var _ = Describe("ConfigValidationImpl", func() {
704704
Expect(qos).ToNot(BeNil())
705705
Expect(qos.Trust).To(Equal("dscp"))
706706
Expect(qos.PFC).To(Equal("0,0,0,1,0,0,0,0"))
707-
Expect(qos.ToS).To(Equal(96))
707+
Expect(qos.ToS).To(Equal(0))
708708
})
709709

710710
It("should prioritize RoceOptimized settings over defaults when both optimizations are enabled", func() {

pkg/configuration/manager.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,18 @@ func (h configurationManager) ApplyDeviceRuntimeSpec(device *v1alpha1.NicDevice)
279279
return err
280280
}
281281
}
282+
283+
spectrumXConfigApplied, err = h.spectrumXConfigManager.RuntimeConfigApplied(device)
284+
if err != nil {
285+
log.Log.Error(err, "failed to verify spectrumx runtime configuration", "device", device.Name)
286+
return err
287+
}
288+
289+
if !spectrumXConfigApplied {
290+
err = fmt.Errorf("spectrumx runtime config failed to apply")
291+
log.Log.Error(err, "device", device.Name)
292+
return err
293+
}
282294
}
283295

284296
if alreadyApplied {

pkg/devicediscovery/devicediscovery.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,8 @@ func (d deviceDiscovery) DiscoverNicDevices() (map[string]v1alpha1.NicDeviceStat
7878

7979
vpd, err := d.utils.GetVPD(device.Address)
8080
if err != nil {
81-
log.Log.Error(err, "Failed to get device's part and serial numbers", "address", device.Address)
82-
return nil, err
81+
log.Log.Error(err, "Failed to get device's part and serial numbers, skipping", "address", device.Address)
82+
continue
8383
}
8484

8585
// Devices with the same serial number are ports of the same NIC, so grouping them

pkg/devicediscovery/devicediscovery_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -134,14 +134,14 @@ var _ = Describe("DeviceDiscovery", func() {
134134
mockUtils.AssertExpectations(GinkgoT())
135135
})
136136

137-
It("should fails if GetPartAndSerialNumber fails", func() {
137+
It("should not fail if GetPartAndSerialNumber fails", func() {
138138
mockUtils.On("IsSriovVF", "0000:00:00.0").Return(false)
139139
mockUtils.On("GetVPD", "0000:00:00.0").
140140
Return(nil, errors.New("serial number error"))
141141

142142
devices, err := manager.DiscoverNicDevices()
143-
Expect(err).To(HaveOccurred())
144-
Expect(devices).To(BeNil())
143+
Expect(err).NotTo(HaveOccurred())
144+
Expect(devices).To(BeEmpty())
145145
mockUtils.AssertExpectations(GinkgoT())
146146
})
147147

@@ -259,7 +259,7 @@ var _ = Describe("DeviceDiscovery", func() {
259259
mockUtils.AssertExpectations(GinkgoT())
260260
})
261261

262-
It("should log and skip only a faulty device if GetPartAndSerialNumber fails", func() {
262+
It("should log and skip only a faulty device if GetVPD fails", func() {
263263
mockUtils.On("IsSriovVF", "0000:00:00.0").
264264
Return(false)
265265
mockUtils.On("GetVPD", "0000:00:00.0").
@@ -277,8 +277,8 @@ var _ = Describe("DeviceDiscovery", func() {
277277
Return(nil, errors.New("serial number error"))
278278

279279
devices, err := manager.DiscoverNicDevices()
280-
Expect(err).To(HaveOccurred())
281-
Expect(devices).To(BeNil())
280+
Expect(err).NotTo(HaveOccurred())
281+
Expect(devices).To(HaveLen(1))
282282
mockUtils.AssertExpectations(GinkgoT())
283283
})
284284

pkg/dms/client.go

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -298,13 +298,15 @@ func (i *dmsInstance) SetQoSSettings(spec *v1alpha1.QosSpec) error {
298298
}
299299
log.Log.V(2).Info("PFC configuration set successfully", "device", i.device.SerialNumber, "interface", port.NetworkInterface)
300300

301-
log.Log.V(2).Info("Setting ToS configuration", "device", i.device.SerialNumber, "port", idx+1, "interface", port.NetworkInterface)
302-
err = i.RunSetPathCommand(ToSPath, fmt.Sprintf("%d", spec.ToS), ValueTypeInt, interfaceNameFilter(port.NetworkInterface))
303-
if err != nil {
304-
log.Log.V(2).Error(err, "Failed to set ToS configuration", "device", i.device.SerialNumber, "interface", port.NetworkInterface)
305-
return fmt.Errorf("failed to set ToS configuration: %v", err)
301+
if spec.ToS != 0 {
302+
log.Log.V(2).Info("Setting ToS configuration", "device", i.device.SerialNumber, "port", idx+1, "interface", port.NetworkInterface)
303+
err = i.RunSetPathCommand(ToSPath, fmt.Sprintf("%d", spec.ToS), ValueTypeInt, interfaceNameFilter(port.NetworkInterface))
304+
if err != nil {
305+
log.Log.V(2).Error(err, "Failed to set ToS configuration", "device", i.device.SerialNumber, "interface", port.NetworkInterface)
306+
return fmt.Errorf("failed to set ToS configuration: %v", err)
307+
}
308+
log.Log.V(2).Info("ToS configuration set successfully", "device", i.device.SerialNumber, "interface", port.NetworkInterface)
306309
}
307-
log.Log.V(2).Info("ToS configuration set successfully", "device", i.device.SerialNumber, "interface", port.NetworkInterface)
308310
}
309311

310312
log.Log.V(2).Info("QoS settings applied to all ports", "device", i.device.SerialNumber, "portCount", portCount)
@@ -401,12 +403,13 @@ func (i *dmsInstance) GetParameters(params []types.ConfigurationParameter) (map[
401403
}
402404
}
403405

404-
}
405-
406-
value, err := i.RunGetPathCommand(param.DMSPath, nil)
407-
if err != nil {
408-
log.Log.V(2).Error(err, "Failed to get parameter", "device", i.device.SerialNumber, "param", param)
409-
return nil, fmt.Errorf("failed to get parameter: %v", err)
406+
} else {
407+
var err error
408+
value, err = i.RunGetPathCommand(param.DMSPath, nil)
409+
if err != nil {
410+
log.Log.V(2).Error(err, "Failed to get parameter", "device", i.device.SerialNumber, "param", param)
411+
return nil, fmt.Errorf("failed to get parameter: %v", err)
412+
}
410413
}
411414

412415
values[param.DMSPath] = value
@@ -443,11 +446,11 @@ func (i *dmsInstance) SetParameters(params []types.ConfigurationParameter) error
443446
}
444447
}
445448
}
446-
}
447-
448-
err := i.RunSetPathCommand(param.DMSPath, param.Value, param.ValueType, nil)
449-
if err != nil {
450-
return err
449+
} else {
450+
err := i.RunSetPathCommand(param.DMSPath, param.Value, param.ValueType, nil)
451+
if err != nil {
452+
return err
453+
}
451454
}
452455
}
453456

0 commit comments

Comments
 (0)