@@ -62,6 +62,9 @@ type SpectrumXManager interface {
6262 GetDocaCCTargetVersion (device * v1alpha1.NicDevice ) (string , error )
6363 // RunDocaSpcXCC launches and keeps track of the DOCA SPC-X CC process for the given port
6464 RunDocaSpcXCC (port v1alpha1.NicDevicePortSpec ) error
65+ // GetCCTerminationChannel returns a read-only channel that receives the RDMA interface name
66+ // when a DOCA SPC-X CC process terminates unexpectedly after startup
67+ GetCCTerminationChannel () <- chan string
6568}
6669
6770type spectrumXConfigManager struct {
@@ -70,14 +73,16 @@ type spectrumXConfigManager struct {
7073 execInterface execUtils.Interface
7174 nvConfigUtils nvconfig.NVConfigUtils
7275
73- ccProcesses map [string ]* ccProcess
76+ ccProcesses map [string ]* ccProcess
77+ ccTerminationChan chan string // buffered; carries RDMA iface name on unexpected exit
7478}
7579
7680type ccProcess struct {
7781 port v1alpha1.NicDevicePortSpec
7882 cmd execUtils.Cmd
7983
80- running atomic.Bool
84+ running atomic.Bool
85+ startupCheckPassed atomic.Bool // set after the 3s startup window; distinguishes startup failures from runtime crashes
8186
8287 // Error handling with mutex protection
8388 errMutex sync.RWMutex
@@ -704,6 +709,16 @@ func (m *spectrumXConfigManager) RunDocaSpcXCC(port v1alpha1.NicDevicePortSpec)
704709
705710 log .Log .V (2 ).Info ("SpectrumXConfigManager.RunDocaSpcXCC(): CC process output" , "rdma" , port .RdmaInterface , "output" , string (output ))
706711 process .running .Store (false )
712+
713+ // Notify controller only for runtime crashes (after startup check passed)
714+ if process .startupCheckPassed .Load () {
715+ log .Log .Info ("SpectrumXConfigManager.RunDocaSpcXCC(): CC process terminated unexpectedly, sending notification" , "rdma" , port .RdmaInterface )
716+ select {
717+ case m .ccTerminationChan <- port .RdmaInterface :
718+ default :
719+ log .Log .V (2 ).Info ("SpectrumXConfigManager.RunDocaSpcXCC(): termination channel full, notification dropped" , "rdma" , port .RdmaInterface )
720+ }
721+ }
707722 }()
708723
709724 log .Log .V (2 ).Info ("Waiting 3s for DOCA SPC-X CC to start" , "rdma" , port .RdmaInterface )
@@ -723,19 +738,27 @@ func (m *spectrumXConfigManager) RunDocaSpcXCC(port v1alpha1.NicDevicePortSpec)
723738
724739 log .Log .V (2 ).Info ("DOCA SPC-X CC process started" , "rdma" , port .RdmaInterface )
725740
741+ process .startupCheckPassed .Store (true )
726742 m .ccProcesses [port .RdmaInterface ] = process
727743
728744 log .Log .Info ("Started DOCA SPC-X CC process" , "rdma" , port .RdmaInterface )
729745
730746 return nil
731747}
732748
749+ // GetCCTerminationChannel returns a read-only channel for CC process termination notifications.
750+ // The channel carries the RDMA interface name of the terminated CC process.
751+ func (m * spectrumXConfigManager ) GetCCTerminationChannel () <- chan string {
752+ return m .ccTerminationChan
753+ }
754+
733755func NewSpectrumXConfigManager (dmsManager dms.DMSManager , spectrumXConfigs map [string ]* types.SpectrumXConfig ) SpectrumXManager {
734756 return & spectrumXConfigManager {
735- dmsManager : dmsManager ,
736- spectrumXConfigs : spectrumXConfigs ,
737- execInterface : execUtils .New (),
738- nvConfigUtils : nvconfig .NewNVConfigUtils (),
739- ccProcesses : make (map [string ]* ccProcess ),
757+ dmsManager : dmsManager ,
758+ spectrumXConfigs : spectrumXConfigs ,
759+ execInterface : execUtils .New (),
760+ nvConfigUtils : nvconfig .NewNVConfigUtils (),
761+ ccProcesses : make (map [string ]* ccProcess ),
762+ ccTerminationChan : make (chan string , 10 ),
740763 }
741764}
0 commit comments