forked from litmuschaos/litmus-go
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathssm-chaos.go
180 lines (150 loc) · 7.54 KB
/
ssm-chaos.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
package lib
import (
"context"
"os"
"strings"
"time"
experimentTypes "github.com/litmuschaos/litmus-go/pkg/aws-ssm/aws-ssm-chaos/types"
"github.com/litmuschaos/litmus-go/pkg/clients"
"github.com/litmuschaos/litmus-go/pkg/cloud/aws/ssm"
"github.com/litmuschaos/litmus-go/pkg/events"
"github.com/litmuschaos/litmus-go/pkg/log"
"github.com/litmuschaos/litmus-go/pkg/probe"
"github.com/litmuschaos/litmus-go/pkg/telemetry"
"github.com/litmuschaos/litmus-go/pkg/types"
"github.com/litmuschaos/litmus-go/pkg/utils/common"
"github.com/palantir/stacktrace"
"go.opentelemetry.io/otel"
)
// InjectChaosInSerialMode injects the AWS SSM chaos into the target instances
// one after another.
//
// If a signal is already pending on inject when the function is entered, the
// process exits with status 0 before any command is sent. Otherwise the
// function keeps looping until the time elapsed since it started reaches
// experimentsDetails.ChaosDuration (compared in whole seconds). On every pass
// it emits a ChaosInject event (only when EngineName is set) and then, for each
// instance ID in instanceIDList:
//   - sends the SSM command to that instance and appends the returned command
//     ID to experimentsDetails.CommandIDs (used for abort recovery),
//   - waits for the command to reach "InProgress" and marks the target as
//     "injected",
//   - runs the "DuringChaos" probes, but only for the first instance of the
//     pass and only when probes are configured,
//   - waits for the command to reach "Success" and marks the target as
//     "reverted",
//   - sleeps for ChaosInterval seconds.
//
// The first failing step stops the run and its error is returned with added
// context; nil is returned once the chaos duration has elapsed.
func InjectChaosInSerialMode(ctx context.Context, experimentsDetails *experimentTypes.ExperimentDetails, instanceIDList []string, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails, inject chan os.Signal) error {
	ctx, span := otel.Tracer(telemetry.TracerName).Start(ctx, "InjectAWSSSMFaultInSerialMode")
	defer span.End()

	// Non-blocking check: stop the chaos execution if an abort signal was
	// already received. os.Exit does not run deferred calls, so span.End() is
	// skipped on this path.
	select {
	case <-inject:
		os.Exit(0)
	default:
	}

	// chaosStart is the timestamp at which the chaos injection begins.
	chaosStart := time.Now()

	for int(time.Since(chaosStart).Seconds()) < experimentsDetails.ChaosDuration {
		log.Infof("[Info]: Target instanceID list, %v", instanceIDList)

		if experimentsDetails.EngineName != "" {
			msg := "Injecting " + experimentsDetails.ExperimentName + " chaos on ec2 instance"
			types.SetEngineEventAttributes(eventsDetails, types.ChaosInject, msg, "Normal", chaosDetails)
			events.GenerateEvents(eventsDetails, clients, chaosDetails, "ChaosEngine")
		}

		// Run the SSM command on one instance at a time.
		for i, ec2ID := range instanceIDList {
			log.Info("[Chaos]: Starting the ssm command")
			// SendSSMCommand takes a list of targets; build it from the single ID.
			ec2IDList := strings.Fields(ec2ID)
			commandID, err := ssm.SendSSMCommand(experimentsDetails, ec2IDList)
			if err != nil {
				return stacktrace.Propagate(err, "failed to send ssm command")
			}
			// Remember the command so it can be cancelled on abort.
			experimentsDetails.CommandIDs = append(experimentsDetails.CommandIDs, commandID)

			log.Info("[Wait]: Waiting for the ssm command to get in InProgress state")
			if err := ssm.WaitForCommandStatus("InProgress", commandID, ec2ID, experimentsDetails.Region, experimentsDetails.ChaosDuration+experimentsDetails.Timeout, experimentsDetails.Delay); err != nil {
				return stacktrace.Propagate(err, "failed to start ssm command")
			}
			common.SetTargets(ec2ID, "injected", "EC2", chaosDetails)

			// Probes run once per pass, while the first instance is under chaos.
			if len(resultDetails.ProbeDetails) != 0 && i == 0 {
				if err := probe.RunProbes(ctx, chaosDetails, clients, resultDetails, "DuringChaos", eventsDetails); err != nil {
					return stacktrace.Propagate(err, "failed to run probes")
				}
			}

			log.Info("[Wait]: Waiting for the ssm command to get completed")
			if err := ssm.WaitForCommandStatus("Success", commandID, ec2ID, experimentsDetails.Region, experimentsDetails.ChaosDuration+experimentsDetails.Timeout, experimentsDetails.Delay); err != nil {
				// Distinct message: this is a completion failure, not a send failure.
				return stacktrace.Propagate(err, "failed to wait for ssm command to succeed")
			}
			common.SetTargets(ec2ID, "reverted", "EC2", chaosDetails)

			log.Infof("[Wait]: Waiting for chaos interval of %vs", experimentsDetails.ChaosInterval)
			time.Sleep(time.Duration(experimentsDetails.ChaosInterval) * time.Second)
		}
	}
	return nil
}
// InjectChaosInParallelMode injects the AWS SSM chaos into all target
// instances at once.
//
// If a signal is already pending on inject when the function is entered, the
// process exits with status 0 before any command is sent. Otherwise the
// function keeps looping until the time elapsed since it started reaches
// experimentsDetails.ChaosDuration (compared in whole seconds). On every pass
// it:
//   - emits a ChaosInject event (only when EngineName is set),
//   - sends a single SSM command targeting the whole instanceIDList and appends
//     the returned command ID to experimentsDetails.CommandIDs (used for abort
//     recovery),
//   - waits, instance by instance, for the command to reach "InProgress",
//   - runs the "DuringChaos" probes when probes are configured,
//   - waits, instance by instance, for the command to reach "Success",
//   - sleeps for ChaosInterval seconds.
//
// The first failing step stops the run and its error is returned with added
// context; nil is returned once the chaos duration has elapsed.
//
// NOTE(review): unlike InjectChaosInSerialMode, this function never calls
// common.SetTargets to mark targets as injected/reverted — confirm whether
// that is intended.
func InjectChaosInParallelMode(ctx context.Context, experimentsDetails *experimentTypes.ExperimentDetails, instanceIDList []string, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails, inject chan os.Signal) error {
	ctx, span := otel.Tracer(telemetry.TracerName).Start(ctx, "InjectAWSSSMFaultInParallelMode")
	defer span.End()

	// Non-blocking check: stop the chaos execution if an abort signal was
	// already received. os.Exit does not run deferred calls, so span.End() is
	// skipped on this path.
	select {
	case <-inject:
		os.Exit(0)
	default:
	}

	// chaosStart is the timestamp at which the chaos injection begins.
	chaosStart := time.Now()

	for int(time.Since(chaosStart).Seconds()) < experimentsDetails.ChaosDuration {
		log.Infof("[Info]: Target instanceID list, %v", instanceIDList)

		if experimentsDetails.EngineName != "" {
			msg := "Injecting " + experimentsDetails.ExperimentName + " chaos on ec2 instance"
			types.SetEngineEventAttributes(eventsDetails, types.ChaosInject, msg, "Normal", chaosDetails)
			events.GenerateEvents(eventsDetails, clients, chaosDetails, "ChaosEngine")
		}

		// One command is sent for every target instance together.
		log.Info("[Chaos]: Starting the ssm command")
		commandID, err := ssm.SendSSMCommand(experimentsDetails, instanceIDList)
		if err != nil {
			return stacktrace.Propagate(err, "failed to send ssm command")
		}
		// Remember the command so it can be cancelled on abort.
		experimentsDetails.CommandIDs = append(experimentsDetails.CommandIDs, commandID)

		// The command must be running on every instance before probes start.
		for _, ec2ID := range instanceIDList {
			log.Info("[Wait]: Waiting for the ssm command to get in InProgress state")
			if err := ssm.WaitForCommandStatus("InProgress", commandID, ec2ID, experimentsDetails.Region, experimentsDetails.ChaosDuration+experimentsDetails.Timeout, experimentsDetails.Delay); err != nil {
				return stacktrace.Propagate(err, "failed to start ssm command")
			}
		}

		// Run the probes during chaos.
		if len(resultDetails.ProbeDetails) != 0 {
			if err := probe.RunProbes(ctx, chaosDetails, clients, resultDetails, "DuringChaos", eventsDetails); err != nil {
				return stacktrace.Propagate(err, "failed to run probes")
			}
		}

		for _, ec2ID := range instanceIDList {
			log.Info("[Wait]: Waiting for the ssm command to get completed")
			if err := ssm.WaitForCommandStatus("Success", commandID, ec2ID, experimentsDetails.Region, experimentsDetails.ChaosDuration+experimentsDetails.Timeout, experimentsDetails.Delay); err != nil {
				// Distinct message: this is a completion failure, not a send failure.
				return stacktrace.Propagate(err, "failed to wait for ssm command to succeed")
			}
		}

		log.Infof("[Wait]: Waiting for chaos interval of %vs", experimentsDetails.ChaosInterval)
		time.Sleep(time.Duration(experimentsDetails.ChaosInterval) * time.Second)
	}
	return nil
}
// AbortWatcher blocks until a signal arrives on abort and then reverts the
// chaos: every command ID recorded in experimentsDetails.CommandIDs is
// cancelled, the SSM document named by experimentsDetails.DocumentName is
// deleted, and the process exits with status 1. Cancellation and deletion
// failures are only logged; they do not stop the remaining cleanup.
func AbortWatcher(experimentsDetails *experimentTypes.ExperimentDetails, abort chan os.Signal) {
	// Wait here until the abort signal is delivered.
	<-abort

	log.Info("[Abort]: Chaos Revert Started")

	if len(experimentsDetails.CommandIDs) == 0 {
		log.Info("[Abort]: No SSM Command found to cancel")
	} else {
		for _, id := range experimentsDetails.CommandIDs {
			cancelErr := ssm.CancelCommand(id, experimentsDetails.Region)
			if cancelErr != nil {
				log.Errorf("[Abort]: Failed to cancel command, recovery failed: %v", cancelErr)
			}
		}
	}

	// The document is removed regardless of whether any command was cancelled.
	deleteErr := ssm.SSMDeleteDocument(experimentsDetails.DocumentName, experimentsDetails.Region)
	if deleteErr != nil {
		log.Errorf("Failed to delete ssm document: %v", deleteErr)
	}

	log.Info("[Abort]: Chaos Revert Completed")
	os.Exit(1)
}