66 api "github.com/chaos-mesh/chaos-mesh/api/v1alpha1"
77 "github.com/kurtosis-tech/stacktrace"
88 log "github.com/sirupsen/logrus"
9+ v1 "k8s.io/api/core/v1"
910 "reflect"
1011 "sigs.k8s.io/controller-runtime/pkg/client"
1112 "time"
@@ -23,38 +24,73 @@ const (
2324 Error FaultStatus = "Error"
2425)
2526
// FaultHasNoDurationErr is the sentinel error returned by getDuration when the fault spec carries
// no duration value. NewFaultSession compares against it by identity and, on a match, leaves
// TestDuration and TestEndTime nil instead of failing.
var FaultHasNoDurationErr = fmt.Errorf("this fault has no expected duration")
28+
2629// succeeded (inject worked, now back to normal)
2730// failure?
2831// time out?
2932
3033type FaultSession struct {
31- client * ChaosClient
32- faultKind * api.ChaosKind
33- faultSpec map [string ]interface {}
34- Name string
35- podsFailingRecovery map [string ]* api.Record
36- TestStartTime time.Time
37- TestDuration * time.Duration
38- TestEndTime time.Time
34+ client * ChaosClient
35+ faultKind * api.ChaosKind
36+ faultType string
37+ faultAction string
38+ faultSpec map [string ]interface {}
39+ Name string
40+ podsFailingRecovery map [string ]* api.Record
41+ checkedForMissingPods bool
42+ podsExpectedMissing int
43+ TestStartTime time.Time
44+ TestDuration * time.Duration
45+ TestEndTime * time.Time
46+ TargetSelectionCompleted bool
3947}
4048
4149func NewFaultSession (ctx context.Context , client * ChaosClient , faultKind * api.ChaosKind , faultSpec map [string ]interface {}, name string ) (* FaultSession , error ) {
4250 now := time .Now ()
4351
52+ faultKindStr , ok := faultSpec ["kind" ].(string )
53+ if ! ok {
54+ return nil , stacktrace .NewError ("failed to decode faultSpec.kind to string: %s" , faultSpec ["kind" ])
55+ }
56+
57+ spec , ok := faultSpec ["spec" ].(map [string ]interface {})
58+ if ! ok {
59+ return nil , stacktrace .NewError ("failed to decode faultSpec.spec to map[string]interface{}" )
60+ }
61+
62+ faultAction , ok := spec ["action" ].(string )
63+ if ! ok {
64+ return nil , stacktrace .NewError ("failed to decode faultSpec.spec.action to string: %s" , spec ["action" ])
65+ }
66+
4467 partial := & FaultSession {
45- client : client ,
46- faultKind : faultKind ,
47- faultSpec : faultSpec ,
48- Name : name ,
49- podsFailingRecovery : map [string ]* api.Record {},
50- TestStartTime : now ,
68+ client : client ,
69+ faultKind : faultKind ,
70+ faultType : faultKindStr ,
71+ faultSpec : spec ,
72+ faultAction : faultAction ,
73+ Name : name ,
74+ podsFailingRecovery : map [string ]* api.Record {},
75+ TestStartTime : now ,
76+ podsExpectedMissing : 0 ,
77+ checkedForMissingPods : false ,
78+ TargetSelectionCompleted : false ,
5179 }
5280 duration , err := partial .getDuration (ctx )
5381 if err != nil {
54- return nil , err
82+ if err == FaultHasNoDurationErr {
83+ partial .TestDuration = nil
84+ partial .TestEndTime = nil
85+ } else {
86+ return nil , err
87+ }
88+ } else {
89+ partial .TestDuration = duration
90+ endTime := now .Add (* duration )
91+ partial .TestEndTime = & endTime
5592 }
56- partial .TestDuration = duration
57- partial .TestEndTime = now .Add (* duration )
93+
5894 return partial , nil
5995}
6096
@@ -72,12 +108,39 @@ func (f *FaultSession) getKubeResource(ctx context.Context) (client.Object, erro
72108 return resource , nil
73109}
74110
75- func (f * FaultSession ) getDetailedStatus (ctx context.Context ) ([]* api.Record , error ) {
111+ func (f * FaultSession ) checkTargetSelectionCompleted (resource client.Object ) error {
112+ if f .TargetSelectionCompleted {
113+ return nil
114+ }
115+ conditionsVal := reflect .ValueOf (resource ).Elem ().FieldByName ("Status" ).FieldByName ("ChaosStatus" ).FieldByName ("Conditions" )
116+ conditions , ok := conditionsVal .Interface ().([]api.ChaosCondition )
117+ if ! ok || conditions == nil {
118+ return stacktrace .NewError ("Unable to decode status.chaosstatus.conditions" )
119+ }
120+ for _ , condition := range conditions {
121+ if condition .Type != api .ConditionSelected {
122+ continue
123+ }
124+ if condition .Status == v1 .ConditionTrue {
125+ log .Info ("chaos-mesh has identified pods to inject into" )
126+ f .TargetSelectionCompleted = true
127+ }
128+ break
129+ }
130+ return nil
131+ }
132+
133+ func (f * FaultSession ) getFaultRecords (ctx context.Context ) ([]* api.Record , error ) {
76134 resource , err := f .getKubeResource (ctx )
77135 if err != nil {
78136 return nil , err
79137 }
80138
139+ err = f .checkTargetSelectionCompleted (resource )
140+ if err != nil {
141+ return nil , err
142+ }
143+
81144 // Feel free to figure out a better way to do this. These fields are part of every Chaos status struct we support,
82145 // but since they don't implement a common interface containing the status fields, there's no clean or simple way
83146 // to extract the values in Go. One alternate option may be to serialize to json, then deserialize into an object
@@ -117,18 +180,51 @@ func (f *FaultSession) checkForFailedRecovery(record *api.Record) (bool, []strin
117180 return true , distinctMessages
118181}
119182
183+ /*
184+ Determines whether the fault will leave some pods in a terminated state, and how many pods will be impacted.
185+ This must be run after the fault manifest has been applied and the handler webhook has run.
186+ */
187+ func (f * FaultSession ) checkForMissingPods (records []* api.Record ) error {
188+ if ! f .checkedForMissingPods {
189+ f .checkedForMissingPods = true
190+ // we expect missing pods when the fault is pod kill.
191+
192+ podsInjected := countInjectedPods (records )
193+ log .Infof ("Chaos-mesh has identified %d pods matching the targeting criteria" , podsInjected )
194+ if f .faultType == "PodChaos" && f .faultAction == "pod-kill" {
195+ f .podsExpectedMissing = podsInjected
196+ log .Infof ("We're expecting %d pods to be terminated from the selected fault" , f .podsExpectedMissing )
197+ }
198+ }
199+ return nil
200+ }
201+
202+ func countInjectedPods (records []* api.Record ) int {
203+ podsInjected := 0
204+ for _ , record := range records {
205+ if record .Phase == "Injected" {
206+ podsInjected += 1
207+ }
208+ }
209+ return podsInjected
210+ }
211+
120212// todo: we need a better way of monitoring fault injection status. There's a ton of statefulness represented in
121213// chaos-mesh that we're glancing over. Situations such as a pod crashing during a fault may produce unexpected behavior
122214// in this code as it currently stands.
123215func (f * FaultSession ) GetStatus (ctx context.Context ) (FaultStatus , error ) {
124- records , err := f .getDetailedStatus (ctx )
216+ records , err := f .getFaultRecords (ctx )
125217 if err != nil {
126218 return Error , err
127219 }
128220
129221 if records == nil {
130222 return Starting , nil
131223 }
224+ err = f .checkForMissingPods (records )
225+ if err != nil {
226+ return Error , err
227+ }
132228
133229 podsInjectedAndRecovered := 0
134230 podsInjectedNotRecovered := 0
@@ -152,15 +248,13 @@ func (f *FaultSession) GetStatus(ctx context.Context) (FaultStatus, error) {
152248 }
153249 }
154250
155- // todo: check if unrecovered pods are failing to recover ^^ up here PodRecord.Events[-1].Operation = "Recover", Type="Failed". Emit Message
156-
157251 if podsNotInjected > 0 {
158252 return Starting , nil
159253 }
160- if podsInjectedNotRecovered > 0 && podsInjectedAndRecovered == 0 {
254+ if podsInjectedNotRecovered - f . podsExpectedMissing > 0 && podsInjectedAndRecovered == 0 {
161255 return InProgress , nil
162256 }
163- if podsInjectedAndRecovered + len (f .podsFailingRecovery ) == len (records ) {
257+ if podsInjectedAndRecovered + len (f .podsFailingRecovery )+ f . podsExpectedMissing == len (records ) {
164258 return Completed , nil
165259 }
166260 if podsInjectedNotRecovered > 0 && podsInjectedAndRecovered > 0 {
@@ -182,6 +276,10 @@ func (f *FaultSession) getDuration(ctx context.Context) (*time.Duration, error)
182276 if ! ok {
183277 return nil , stacktrace .NewError ("unable to cast durationVal to string" )
184278 }
279+ if durationStr == nil {
280+ return nil , FaultHasNoDurationErr
281+ }
282+
185283 duration , err := time .ParseDuration (* durationStr )
186284 if err != nil {
187285 return nil , err
0 commit comments