daos-stack
diff --git a/‎docs/admin/administration.md‎
Lines changed: 99 additions & 10 deletions b/‎docs/admin/administration.md‎
Lines changed: 99 additions & 10 deletions
diff --git a/‎docs/overview/fault.md‎
Lines changed: 65 additions & 0 deletions b/‎docs/overview/fault.md‎
Lines changed: 65 additions & 0 deletions
diff --git a/‎src/control/cmd/dmg/auto_test.go‎
Lines changed: 1 addition & 0 deletions b/‎src/control/cmd/dmg/auto_test.go‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/control/events/ras.go‎
Lines changed: 1 addition & 0 deletions b/‎src/control/events/ras.go‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/control/lib/control/event.go‎
Lines changed: 3 additions & 1 deletion b/‎src/control/lib/control/event.go‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/control/lib/control/mocks.go‎
Lines changed: 9 additions & 1 deletion b/‎src/control/lib/control/mocks.go‎
Lines changed: 9 additions & 1 deletion
diff --git a/‎src/control/server/config/server.go‎
Lines changed: 20 additions & 1 deletion b/‎src/control/server/config/server.go‎
Lines changed: 20 additions & 1 deletion
diff --git a/‎src/control/server/config/server_test.go‎
Lines changed: 3 additions & 1 deletion b/‎src/control/server/config/server_test.go‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/control/server/ctl_check_test.go‎
Lines changed: 1 addition & 1 deletion b/‎src/control/server/ctl_check_test.go‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/control/server/ctl_ranks_rpc.go‎
Lines changed: 24 additions & 1 deletion b/‎src/control/server/ctl_ranks_rpc.go‎
Lines changed: 24 additions & 1 deletion
@@ -49,6 +49,8 @@ severity, message, description, and cause.
 | engine\_died| STATE\_CHANGE| ERROR| DAOS engine <idx\> exited unexpectedly: <error\> | Indicates engine instance <idx\> exited unexpectedly. <error\> describes the exit state returned from the exited daos\_engine process.| N/A                          |
 | engine\_asserted| STATE\_CHANGE| ERROR| TBD| Indicates engine instance <idx\> threw a runtime assertion, causing a crash. | An unexpected internal state resulted in assert failure. |
 | engine\_clock\_drift| INFO\_ONLY   | ERROR| clock drift detected| Indicates CART comms layer has detected clock skew between engines.| NTP may not be syncing clocks across DAOS system.      |
+| engine\_self\_terminated| INFO\_ONLY| NOTICE| excluded rank self terminated detected| Indicates that a DAOS engine rank has performed a self-termination due to having been excluded from the system's group map. The rank is automatically restarted by the control plane with rate-limiting (default: 5 minute minimum delay between restarts per rank) to prevent restart storms. | An engine was found to be in a transient non-functional state and excluded from the group map. The control plane monitors for this event and automatically restarts the affected engine so it can rejoin the system. Restarts are rate-limited per rank using the `engine_auto_restart_min_delay` configuration parameter. |
+| engine\_join\_failed| INFO\_ONLY| ERROR | DAOS engine <idx\> (rank <rank\>) was not allowed to join the system | Join operation failed for the given engine instance ID and rank (if assigned). | Reason should be provided in the extended info field of the event data. |
 | pool\_corruption\_detected| INFO\_ONLY| ERROR | Data corruption detected| Indicates a corruption in pool data has been detected. The event fields will contain pool and container UUIDs. | A corruption was found by the checksum scrubber. |
 | pool\_rebuild\_started| INFO\_ONLY| NOTICE   | Pool rebuild started.| Indicates a pool rebuild has started. The event data field contains pool map version and pool operation identifier. | When a pool rank becomes unavailable a rebuild will be triggered.   |
 | pool\_rebuild\_finished| INFO\_ONLY| NOTICE| Pool rebuild finished.| Indicates a pool rebuild has finished successfully. The event data field includes the pool map version and pool operation identifier.  | N/A|
@@ -69,7 +71,6 @@ severity, message, description, and cause.
 | device\_plugged| INFO\_ONLY| NOTICE| Detected hot plugged device: <bdev-name\> | Indicates device was physically inserted into host. | NVMe SSD physically added to host. |
 | device\_replace| INFO\_ONLY| NOTICE or ERROR| Replaced device: <uuid\> with device: <uuid\> [failed: <rc\>] | Indicates that a faulty device was replaced with a new device and if the operation failed. The old and new device IDs as well as any non-zero return code are specified in the event data. | Device was replaced using DMG nvme replace command. |
 | system\_fabric\_provider\_changed| INFO\_ONLY| NOTICE| System fabric provider has changed: <old-provider\> -> <new-provider\>| Indicates that the system-wide fabric provider has been updated. No other specific information is included in event data.| A system-wide fabric provider change has been intentionally applied to all joined ranks.|
-| engine\_join\_failed| INFO\_ONLY| ERROR | DAOS engine <idx\> (rank <rank\>) was not allowed to join the system | Join operation failed for the given engine instance ID and rank (if assigned). | Reason should be provided in the extended info field of the event data. |
 | device\_link\_speed\_changed| INFO\_ONLY| NOTICE or WARNING| NVMe PCIe device at <pci-address\> port-<idx\>: link speed changed to <transfer-rate\> (max <transfer-rate\>)| Indicates that an NVMe device link speed has changed. The negotiated and maximum device link speeds are indicated in the event message field and the severity is set to warning if the negotiated speed is not at maximum capability (and notice level severity if at maximum). No other specific information is included in the event data.| Either device link speed was previously downgraded and has returned to maximum or link speed has downgraded to a value that is less than its maximum capability.|
 | device\_link\_width\_changed| INFO\_ONLY| NOTICE or WARNING| NVMe PCIe device at <pci-address\> port-<idx\>: link width changed to <pcie-link-lanes\> (max <pcie-link-lanes\>)| Indicates that an NVMe device link width has changed. The negotiated and maximum device link widths are indicated in the event message field and the severity is set to warning if the negotiated width is not at maximum capability (and notice level severity if at maximum). No other specific information is included in the event data.| Either device link width was previously downgraded and has returned to maximum or link width has downgraded to a value that is less than its maximum capability.|
 | device\_led\_set| INFO\_ONLY| NOTICE| LED on device <device\> set to state <state\>| Indicates that the LED state has been changed on a device. Device identifier and LED state are specified in the event message.| LED control command was issued to change device LED state for visual identification or fault indication.|
@@ -1007,6 +1008,94 @@ specified on the command line:
 If the ranks were excluded from pools (e.g., unclean shutdown), they will need to
 be reintegrated. Please see the pool operation section for more information.
 
+### Engine Auto-Restart
+
+DAOS automatically restarts engines that self-terminate after being excluded from
+the system. This feature improves system availability by recovering from transient
+failures without administrator intervention.
+
+#### How It Works
+
+When an engine is excluded (e.g., due to network issues detected by SWIM), the
+engine detects the exclusion and performs a self-termination. The control plane
+monitors for these events and automatically restarts the affected engine after
+clearing the exclusion state, allowing it to rejoin the system.
+
+The automatic restart includes rate-limiting to prevent restart storms. By default,
+an engine must wait 5 minutes between automatic restarts.
+
+#### Configuration
+
+Control auto-restart behavior in `daos_server.yml`:
+
+```yaml
+# Disable automatic restart (default: enabled)
+disable_engine_auto_restart: false
+
+# Minimum delay between automatic restarts per rank (default: 300 seconds)
+engine_auto_restart_min_delay: 300
+```
+
+#### Manual Operations
+
+Manual `dmg system stop` and `dmg system start` operations are never affected by
+the rate-limiting mechanism. Administrators can always immediately stop and start
+ranks regardless of recent automatic restart activity.
+
+```bash
+# Manual operations always work immediately
+$ dmg system stop --ranks=0,1,2
+$ dmg system start --ranks=0,1,2
+```
+
+When you manually stop or start ranks, the restart history for those ranks is
+automatically cleared, ensuring no delays from previous automatic restarts.
+
+#### Monitoring
+
+The `engine_self_terminated` RAS event is logged when an engine self-terminates
+and triggers an automatic restart:
+
+```
+&&& RAS EVENT id: [engine_self_terminated] ... msg: [excluded rank self terminated detected]
+```
+
+Use `dmg system query` to check rank status and incarnation numbers. The
+incarnation number increments each time a rank restarts, helping track restart
+events:
+
+```bash
+$ dmg system query --ranks=0
+Rank UUID                                 Control Address  Fault Domain State  Reason Incarnation
+---- ----                                 --------------- ------------- -----  ------ -----------
+0    12345678-1234-1234-1234-123456789012 10.0.0.1:10001  /node1        Joined        3
+```
+
+#### Best Practices
+
+- **Leave enabled**: Automatic restart improves availability for transient failures
+- **Adjust timing**: For frequent exclusions, consider increasing `engine_auto_restart_min_delay`
+- **Monitor events**: Watch for repeated `engine_self_terminated` events indicating persistent issues
+- **Manual control**: Use `dmg system stop/start` for maintenance without worrying about delays
+
+#### Troubleshooting
+
+**Problem**: Rank keeps self-terminating and restarting
+
+**Solution**: Investigate root cause:
+1. Check network connectivity (SWIM may be detecting real failures)
+2. Review engine logs for errors
+3. Verify hardware health
+4. Consider disabling auto-restart temporarily for investigation
+
+**Problem**: Need immediate restart but recently auto-restarted
+
+**Solution**: Use manual operations (not affected by rate-limiting):
+```bash
+$ dmg system stop --ranks=X
+$ dmg system start --ranks=X
+```
+
 ### Storage Reformat
 
 To reformat the system after a controlled shutdown, run the command:
@@ -1052,15 +1141,15 @@ the storage server has not changed the old rank can be "reused" by formatting us
 
 An example workflow would be:
 
-- `daos_server` is running and PMem NVDIMM fails causing an engine to enter excluded state.
-- `daos_server` is stopped, storage server powered down, faulty PMem NVDIMM is replaced.
-- After powering up storage server, `daos_server scm prepare` command is used to repair PMem.
-- Storage server is rebooted after running `daos_server scm prepare` and command is run again.
-- Now PMem is intact, clear with `wipefs -a /dev/pmemX` where "X" refers to the repaired PMem ID.
-- `daos_server` can be started again. On start-up repaired engine prompts for "SCM format required".
-- Run `dmg storage format --replace` to rejoin with existing rank (if --replace isn't used, a new
-  rank will be created).
-- Formatted engine will join using the existing (old) rank which is mapped to the engine's hardware.
+1. `daos_server` is running and PMem NVDIMM fails causing an engine to enter excluded state.
+2. `daos_server` is stopped, storage server powered down, faulty PMem NVDIMM is replaced.
+3. After powering up storage server, `daos_server scm prepare` command is used to repair PMem.
+4. Storage server is rebooted after running `daos_server scm prepare` and command is run again.
+5. Now PMem is intact, clear with `wipefs -a /dev/pmemX` where "X" refers to the repaired PMem ID.
+6. `daos_server` can be started again. On start-up repaired engine prompts for "SCM format required".
+7. Run `dmg storage format --replace` to rejoin with existing rank (if --replace isn't used, a new
+   rank will be created).
+8. Formatted engine will join using the existing (old) rank which is mapped to the engine's hardware.
 
 !!! note
     `dmg storage format --replace` can be used to replace a rank in `AdminExcluded` state. The
 
@@ -84,3 +84,68 @@ can now read from the rebuilt object shards.
 
 This rebuild process is executed online while applications continue accessing
 and updating objects.
+
+### Engine Self-Termination and Automatic Restart
+
+A DAOS engine may be excluded from the group map, for example because of
+inactivity. When an engine becomes aware of its removal from the
+group map it will self-terminate to protect data integrity and system stability.
+
+When an engine self-terminates, it raises an `engine_self_terminated` RAS event
+(INFO_ONLY, NOTICE severity) containing the rank and incarnation information.
+The control plane automatically handles this event by:
+
+1. Detecting the engine self terminated event through the RAS event system
+2. Identifying the engine instance associated with the rank
+3. Waiting for the engine process to fully stop
+4. Automatically restarting the engine to rejoin the system
+
+This automatic restart mechanism is implemented in all control servers to ensure
+local engine recovery happens regardless of management service leadership state.
+The restarted engine will rejoin the system with a new incarnation number and
+resume normal operations.
+
+This self-healing mechanism allows DAOS to automatically recover system
+membership state from transient engine failures without administrator
+intervention, improving overall system availability.
+
+#### Rate Limiting
+
+To prevent restart storms and ensure system stability, automatic engine restarts
+are rate-limited on a per-rank basis. By default, a minimum delay of 300 seconds
+(5 minutes) is enforced between consecutive restart attempts for the same rank.
+
+When an engine self-terminates within the minimum delay period, the control plane
+schedules a deferred restart that will automatically trigger when the delay expires.
+If multiple self-termination events occur for the same rank during the delay period
+(this would be unexpected), only the most recent event triggers a deferred restart.
+This ensures the engine is restarted exactly once after the delay, regardless of
+how many self-termination events occur.
+
+The rate-limiting interval can be customized by setting the
+`engine_auto_restart_min_delay` configuration option (in seconds) in the
+daos_server.yml file. For example:
+
+```yaml
+engine_auto_restart_min_delay: 600  # 10 minutes between restarts
+```
+
+This protection mechanism prevents scenarios where:
+- Repeated transient failures cause excessive restart cycling
+- A misconfigured engine continuously self-terminates
+- Cascading failures overwhelm the control plane with restart requests
+
+#### Disabling Automatic Restart
+
+The automatic restart behavior can be completely disabled by setting the
+`disable_engine_auto_restart` configuration option to `true` in the
+daos_server.yml file:
+
+```yaml
+disable_engine_auto_restart: true
+```
+
+When auto restart is disabled, engines that self-terminate will not be
+automatically restarted by the control plane, requiring manual intervention
+to restart the affected engine instances. This setting may be useful for
+debugging scenarios or when custom external restart management is preferred.
@@ -606,6 +606,7 @@ mgmt_svc_replicas:
 - hostX:10002
 fault_cb: ""
 hyperthreads: false
+disable_engine_auto_restart: false
 `
 	)
 
 
@@ -49,6 +49,7 @@ const (
 	RASUnknownEvent            RASID = C.RAS_UNKNOWN_EVENT
 	RASEngineFormatRequired    RASID = C.RAS_ENGINE_FORMAT_REQUIRED     // notice
 	RASEngineDied              RASID = C.RAS_ENGINE_DIED                // error
+	RASEngineSelfTerminated    RASID = C.RAS_ENGINE_SELF_TERMINATED     // notice
 	RASPoolRepsUpdate          RASID = C.RAS_POOL_REPS_UPDATE           // info
 	RASSwimRankAlive           RASID = C.RAS_SWIM_RANK_ALIVE            // info
 	RASSwimRankDead            RASID = C.RAS_SWIM_RANK_DEAD             // info
 
@@ -1,5 +1,6 @@
 //
 // (C) Copyright 2021-2024 Intel Corporation.
+// (C) Copyright 2026 Hewlett Packard Enterprise Development LP
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -170,7 +171,8 @@ func newEventLogger(logBasic logging.Logger, newSyslogger newSysloggerFn) *Event
 }
 
 // NewEventLogger returns an initialized EventLogger capable of writing to the
-// supplied logger in addition to syslog.
+// supplied logger in addition to syslog. It should only be used in production
+// code; unit tests should use MockEventLogger instead.
 func NewEventLogger(log logging.Logger) *EventLogger {
 	return newEventLogger(log, syslog.NewLogger)
 }
@@ -1,6 +1,6 @@
 //
 // (C) Copyright 2020-2024 Intel Corporation.
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
+// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -30,6 +30,7 @@ import (
 	"github.com/daos-stack/daos/src/control/common/test"
 	"github.com/daos-stack/daos/src/control/lib/hostlist"
 	"github.com/daos-stack/daos/src/control/lib/ranklist"
+	"github.com/daos-stack/daos/src/control/logging"
 	"github.com/daos-stack/daos/src/control/server/config"
 	"github.com/daos-stack/daos/src/control/server/engine"
 	"github.com/daos-stack/daos/src/control/server/storage"
@@ -945,3 +946,10 @@ func MockHostFabricMap(t *testing.T, scans ...*MockFabricScan) HostFabricMap {
 
 	return hfm
 }
+
+// MockEventLogger returns an EventLogger that wraps logBasic and has no syslog handlers registered.
+func MockEventLogger(logBasic logging.Logger) *EventLogger {
+	return &EventLogger{
+		log: logBasic,
+	}
+}
@@ -98,7 +98,9 @@ type Server struct {
 	Path string `yaml:"-"` // path to config file
 
 	// Behavior flags
-	AutoFormat bool `yaml:"-"`
+	AutoFormat                bool `yaml:"-"`
+	DisableEngineAutoRestart  bool `yaml:"disable_engine_auto_restart"`
+	EngineAutoRestartMinDelay int  `yaml:"engine_auto_restart_min_delay,omitempty"`
 
 	deprecatedParams `yaml:",inline"`
 }
@@ -355,6 +357,18 @@ func (cfg *Server) WithTelemetryPort(port int) *Server {
 	return cfg
 }
 
+// WithDisableEngineAutoRestart sets whether automatic engine restarts on self-termination are disabled.
+func (cfg *Server) WithDisableEngineAutoRestart(disabled bool) *Server {
+	cfg.DisableEngineAutoRestart = disabled
+	return cfg
+}
+
+// WithEngineAutoRestartMinDelay sets the minimum delay, in seconds, between automatic engine restarts.
+func (cfg *Server) WithEngineAutoRestartMinDelay(secs uint) *Server {
+	cfg.EngineAutoRestartMinDelay = int(secs)
+	return cfg
+}
+
 // DefaultServer creates a new instance of configuration struct
 // populated with defaults.
 func DefaultServer() *Server {
@@ -830,6 +844,11 @@ func (cfg *Server) Validate(log logging.Logger) (err error) {
 		return FaultConfigSysRsvdZero
 	}
 
+	if cfg.EngineAutoRestartMinDelay < 0 {
+		return errors.Errorf("engine_auto_restart_min_delay must be >= 0 (got %d)",
+			cfg.EngineAutoRestartMinDelay)
+	}
+
 	// A config without engines is valid when initially discovering hardware prior to adding
 	// per-engine sections with device allocations.
 	if len(cfg.Engines) == 0 {
 
@@ -266,7 +266,9 @@ func TestServerConfig_Constructed(t *testing.T) {
 		WithHyperthreads(true). // hyper-threads disabled by default
 		WithSystemRamReserved(5).
 		WithAllowNumaImbalance(true).
-		WithAllowTHP(true)
+		WithAllowTHP(true).
+		WithDisableEngineAutoRestart(true).
+		WithEngineAutoRestartMinDelay(120)
 
 	// add engines explicitly to test functionality applied in WithEngines()
 	constructed.Engines = []*engine.Config{
 
@@ -116,7 +116,7 @@ func TestServer_ControlService_CheckEngineRepair(t *testing.T) {
 					t.Fatalf("setup error - wrong type for Engine (%T)", e)
 				}
 
-				setupTestEngine(t, srv, uint32(i), rankNums[i])
+				setupTestEngine(t, srv, rankNums[i])
 
 				drpcCfg := new(mockDrpcClientConfig)
 				drpcCfg.ConnectError = tc.drpcErr
 
@@ -1,6 +1,6 @@
 //
 // (C) Copyright 2020-2024 Intel Corporation.
-// (C) Copyright 2025 Hewlett Packard Enterprise Development LP
+// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -153,6 +153,21 @@ func (svc *ControlService) memberStateResults(instances []Engine, tgtState syste
 	return results, nil
 }
 
+// clearRankRestartHistory clears the restart history of each supplied instance whose rank can be
+// read, so that rate-limiting of automatic restarts does not interfere with manual stop/start.
+func clearRankRestartHistory(mgr *engineRestartManager, instances []Engine) {
+	ranks := make([]ranklist.Rank, 0, len(instances))
+	for _, ei := range instances {
+		rank, err := ei.GetRank()
+		if err == nil {
+			ranks = append(ranks, rank)
+		}
+	}
+	if len(ranks) > 0 {
+		mgr.clearRankRestartHistory(ranks)
+	}
+}
+
 // StopRanks implements the method defined for the Management Service.
 //
 // Stop data-plane instance(s) managed by control-plane identified by unique
@@ -206,6 +221,10 @@ func (svc *ControlService) StopRanks(ctx context.Context, req *ctlpb.RanksReq) (
 		return nil, err
 	}
 
+	// clear state history for stopped ranks, instances have already been filtered by
+	// FilterInstancesByRankSet() to match req.GetRanks()
+	clearRankRestartHistory(svc.restartMgr, instances)
+
 	return resp, nil
 }
 
@@ -319,6 +338,10 @@ func (svc *ControlService) StartRanks(ctx context.Context, req *ctlpb.RanksReq)
 		return nil, err
 	}
 
+	// clear state history for started ranks, instances have already been filtered by
+	// FilterInstancesByRankSet() to match req.GetRanks()
+	clearRankRestartHistory(svc.restartMgr, instances)
+
 	return resp, nil
 }
Original file line number	Diff line number	Diff line change
`@@ -606,6 +606,7 @@ mgmt_svc_replicas:`
`606`	`606`	`- hostX:10002`
`607`	`607`	`fault_cb: ""`
`608`	`608`	`hyperthreads: false`
	`609`	`+disable_engine_auto_restart: false`
`609`	`610`	`
`610`	`611`	`)`
`611`	`612`
Original file line number	Diff line number	Diff line change
`@@ -116,7 +116,7 @@ func TestServer_ControlService_CheckEngineRepair(t *testing.T) {`
`116`	`116`	`t.Fatalf("setup error - wrong type for Engine (%T)", e)`
`117`	`117`	`}`
`118`	`118`
`119`		`- setupTestEngine(t, srv, uint32(i), rankNums[i])`
	`119`	`+ setupTestEngine(t, srv, rankNums[i])`
`120`	`120`
`121`	`121`	`drpcCfg := new(mockDrpcClientConfig)`
`122`	`122`	`drpcCfg.ConnectError = tc.drpcErr`