Skip to content

Commit e294f44

Browse files
authored
Merge pull request #55 from crytic/additional-planner-faults
Additional planner faults
2 parents c7da95b + 2c37249 commit e294f44

File tree

12 files changed

+286
-30
lines changed

12 files changed

+286
-30
lines changed

README.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,16 @@ Config:
159159
- grace_period: 1800s # how long to wait for health checks to pass before marking the test as failed
160160
```
161161

162+
#### IOLatency
163+
Config:
164+
```yaml
165+
- grace_period: 1800s # how long to wait for health checks to pass before marking the test as failed
166+
delay: 1000ms # how long the i/o delay should be
167+
duration: 1m # how long the fault should last
168+
percent: 50 # the percentage of i/o requests impacted.
169+
```
170+
171+
162172
## Running test suites
163173

164174
Once you've got your configuration set up, you can run Attacknet:
@@ -175,6 +185,14 @@ At this time, health checks will be run in perpetuity once the fault has conclud
175185
**Dec 15, 2023 version v0.1 (internal)**
176186
- Initial internal release
177187

188+
**Jan 11, 2024 version v0.2 (internal)**
189+
- Updated to kurtosis v0.86.1
190+
- Updated to Go 1.21
191+
- Grafana port-forwarding has been temporarily disabled
192+
- Introduces multi-step tests. This allows multiple faults and other actions to be composed into a single test.
193+
- Introduces the suite planner. The suite planner allows the user to define a set of testing criteria/dimensions, which the planner turns into a suite containing multiple tests.
194+
- Successful & failed test suites now emit test artifacts summarizing the results of the test.
195+
-
178196
## Developing (wip)
179197

180198
1. Install pre-commit

pkg/health/checker.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ import (
1414

1515
type CheckOrchestrator struct {
1616
checkerImpl types.GenericNetworkChecker
17-
gracePeriod time.Duration
17+
gracePeriod *time.Duration
1818
}
1919

2020
func BuildHealthChecker(cfg *confTypes.ConfigParsed, kubeClient *kubernetes.KubeClient, podsUnderTest []*chaos_mesh.PodUnderTest, healthCheckConfig confTypes.HealthCheckConfig) (*CheckOrchestrator, error) {
@@ -35,7 +35,7 @@ func BuildHealthChecker(cfg *confTypes.ConfigParsed, kubeClient *kubernetes.Kube
3535

3636
func (hc *CheckOrchestrator) RunChecks(ctx context.Context) (*types.HealthCheckResult, error) {
3737
start := time.Now()
38-
latestAllowable := start.Add(hc.gracePeriod)
38+
latestAllowable := start.Add(*hc.gracePeriod)
3939
log.Infof("Allowing up to %.0f seconds for health checks to pass on all nodes", hc.gracePeriod.Seconds())
4040

4141
lastHealthCheckResult := &types.HealthCheckResult{}

pkg/kubernetes/port_forward.go

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -111,18 +111,28 @@ func (c *KubeClient) StartPortForwarding(pod string, localPort, remotePort int,
111111
}
112112
log.Debugf("Starting port-forward to pod/%s:%d", pod, remotePort)
113113

114-
go func() {
115-
if err = portForward.ForwardPorts(); err != nil {
116-
panic(stacktrace.Propagate(err, "unable to start port forward session"))
114+
portForwardIssueCh := make(chan error, 1)
115+
defer close(portForwardIssueCh)
116+
117+
retries := 3
118+
go func(retriesRem int) {
119+
for retriesRem > 0 {
120+
if err = portForward.ForwardPorts(); err == nil {
121+
retriesRem--
122+
if retriesRem == 0 {
123+
portForwardIssueCh <- stacktrace.Propagate(err, "unable to start port forward session")
124+
}
125+
}
117126
}
118-
}()
127+
}(retries)
119128

120129
select {
121130
case <-readyCh:
122131
log.Debugf("Port-forward established to pod/%s:%d", pod, remotePort)
123132
case <-time.After(time.Minute):
124133
return nil, errors.New("timed out after waiting to establish port forward")
134+
case err = <-portForwardIssueCh:
135+
return nil, err
125136
}
126-
127137
return stopCh, nil
128138
}

pkg/plan/serialization.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ func SerializeNetworkTopology(nodes []*network.Node, config *network.GenesisConf
1919
"dora",
2020
},
2121
ParallelKeystoreGen: false,
22+
Persistent: true,
2223
}
2324

2425
bs, err := yaml.Marshal(netConfig)

pkg/plan/suite/faults.go

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@ package suite
22

33
import (
44
"attacknet/cmd/pkg/types"
5+
"fmt"
56
"github.com/kurtosis-tech/stacktrace"
67
"gopkg.in/yaml.v3"
8+
"strings"
9+
"time"
710
)
811

912
// We can't use chaos mesh's types because type-inlining is not supported in yaml.v3, making it so you can't serialize
@@ -55,6 +58,27 @@ type PodChaosWrapper struct {
5558
PodChaosFault `yaml:"chaosFaultSpec"`
5659
}
5760

61+
type IOChaosSpec struct {
62+
Selector `yaml:"selector"`
63+
Mode string `yaml:"mode"`
64+
65+
Action string `yaml:"action"`
66+
VolumePath string `yaml:"volumePath"`
67+
Delay *time.Duration `yaml:"delay"`
68+
Percent int `yaml:"percent"`
69+
Duration *time.Duration `yaml:"duration"`
70+
}
71+
72+
type IOChaosFault struct {
73+
Spec IOChaosSpec `yaml:"spec"`
74+
Kind string `yaml:"kind"`
75+
ApiVersion string `yaml:"apiVersion"`
76+
}
77+
78+
type IOChaosWrapper struct {
79+
IOChaosFault `yaml:"chaosFaultSpec"`
80+
}
81+
5882
func convertFaultSpecToMap(s interface{}) (map[string]interface{}, error) {
5983
// convert to map[string]interface{} using yaml intermediate. seriously.
6084
bs, err := yaml.Marshal(s)
@@ -128,3 +152,62 @@ func buildPodRestartFault(description string, expressionSelectors []ChaosExpress
128152
}
129153
return step, nil
130154
}
155+
156+
func getVolumePathForIOFault(podName string) (string, error) {
157+
var nodeType string
158+
parts := strings.Split(podName, "-")
159+
if parts[0] == "el" {
160+
nodeType = "execution"
161+
} else {
162+
nodeType = "consensus"
163+
}
164+
if parts[len(parts)-1] == "validator" {
165+
return "", stacktrace.NewError("cannot create an i/o latency fault on a validator sidecar pod. Try to target matching clients only: %s", podName)
166+
}
167+
clientName := parts[2]
168+
volumeTarget := fmt.Sprintf("/data/%s/%s-data", clientName, nodeType)
169+
return volumeTarget, nil
170+
}
171+
172+
func buildIOLatencyFault(description string, expressionSelector ChaosExpressionSelector, delay *time.Duration, percent int, duration *time.Duration) ([]types.PlanStep, error) {
173+
var steps []types.PlanStep
174+
175+
for _, podName := range expressionSelector.Values {
176+
volumePath, err := getVolumePathForIOFault(podName)
177+
if err != nil {
178+
return nil, err
179+
}
180+
181+
t := IOChaosWrapper{
182+
IOChaosFault: IOChaosFault{
183+
Kind: "IOChaos",
184+
ApiVersion: "chaos-mesh.org/v1alpha1",
185+
Spec: IOChaosSpec{
186+
Duration: duration,
187+
Mode: "all",
188+
Selector: Selector{
189+
ExpressionSelectors: []ChaosExpressionSelector{expressionSelector},
190+
},
191+
Action: "latency",
192+
VolumePath: volumePath,
193+
Delay: delay,
194+
Percent: percent,
195+
},
196+
},
197+
}
198+
199+
faultSpec, err := convertFaultSpecToMap(t)
200+
if err != nil {
201+
return nil, err
202+
}
203+
204+
step := types.PlanStep{
205+
StepType: types.InjectFault,
206+
StepDescription: description,
207+
Spec: faultSpec,
208+
}
209+
steps = append(steps, step)
210+
}
211+
212+
return steps, nil
213+
}

pkg/plan/suite/step_builder.go

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@ import (
44
"attacknet/cmd/pkg/plan/network"
55
"attacknet/cmd/pkg/types"
66
"fmt"
7+
"github.com/kurtosis-tech/stacktrace"
78
log "github.com/sirupsen/logrus"
9+
"time"
810
)
911

1012
type clientType string
@@ -33,9 +35,9 @@ func composeWaitForFaultCompletionStep() *types.PlanStep {
3335
return &types.PlanStep{StepType: types.WaitForFaultCompletion, StepDescription: "wait for faults to terminate"}
3436
}
3537

36-
func composeNodeClockSkewPlanSteps(nodesSelected []*ChaosTargetSelector, skew, duration string) ([]types.PlanStep, error) {
38+
func composeNodeClockSkewPlanSteps(targetsSelected []*ChaosTargetSelector, skew, duration string) ([]types.PlanStep, error) {
3739
var steps []types.PlanStep
38-
for _, target := range nodesSelected {
40+
for _, target := range targetsSelected {
3941
description := fmt.Sprintf("Inject clock skew on target %s", target.Description)
4042

4143
skewStep, err := buildClockSkewFault(description, skew, duration, target.Selector)
@@ -48,10 +50,10 @@ func composeNodeClockSkewPlanSteps(nodesSelected []*ChaosTargetSelector, skew, d
4850
return steps, nil
4951
}
5052

51-
func composeNodeRestartSteps(nodesSelected []*ChaosTargetSelector) ([]types.PlanStep, error) {
53+
func composeNodeRestartSteps(targetsSelected []*ChaosTargetSelector) ([]types.PlanStep, error) {
5254
var steps []types.PlanStep
5355

54-
for _, target := range nodesSelected {
56+
for _, target := range targetsSelected {
5557
description := fmt.Sprintf("Restart target %s", target.Description)
5658
restartStep, err := buildPodRestartFault(description, target.Selector)
5759

@@ -63,3 +65,39 @@ func composeNodeRestartSteps(nodesSelected []*ChaosTargetSelector) ([]types.Plan
6365

6466
return steps, nil
6567
}
68+
69+
func areExprSelectorsMatchingIdIn(expressionSelectors []ChaosExpressionSelector) error {
70+
for _, selector := range expressionSelectors {
71+
if selector.Key != "kurtosistech.com/id" {
72+
return stacktrace.NewError("i/o latency faults can only be target using pod id: %s", selector.Key)
73+
}
74+
if selector.Operator != "In" {
75+
return stacktrace.NewError("i/o latency faults can only be target using the 'In' operator: %s", selector.Operator)
76+
}
77+
}
78+
return nil
79+
}
80+
81+
func composeIOLatencySteps(targetsSelected []*ChaosTargetSelector, delay *time.Duration, percent int, duration *time.Duration) ([]types.PlanStep, error) {
82+
var steps []types.PlanStep
83+
84+
for _, target := range targetsSelected {
85+
description := fmt.Sprintf("Inject i/o latency on target %s", target.Description)
86+
err := areExprSelectorsMatchingIdIn(target.Selector)
87+
if err != nil {
88+
return nil, err
89+
}
90+
91+
// for i/o faults, we need to create a plan step for each individual pod because the fault spec has to say the data path.
92+
for _, selector := range target.Selector {
93+
ioLatencySteps, err := buildIOLatencyFault(description, selector, delay, percent, duration)
94+
if err != nil {
95+
return nil, err
96+
}
97+
steps = append(steps, ioLatencySteps...)
98+
}
99+
}
100+
101+
return steps, nil
102+
103+
}

pkg/plan/suite/suite_builder.go

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"fmt"
77
"github.com/kurtosis-tech/stacktrace"
88
log "github.com/sirupsen/logrus"
9+
"strconv"
910
"time"
1011
)
1112

@@ -72,12 +73,25 @@ func ComposeTestSuite(
7273
}
7374
}
7475
}
75-
76-
log.Infof("ESTIMATE: Running this test suite will take, at minimum, %d minutes", runtimeEstimate/60)
76+
log.Infof("Tests generated: %d", len(tests))
77+
log.Infof("ESTIMATE: Running this test suite will take, at minimum, %d minutes based on fault durations.", runtimeEstimate/60)
7778

7879
return tests, nil
7980
}
8081

82+
func getDurationValue(key string, m map[string]string) (*time.Duration, error) {
83+
84+
valueStr, ok := m[key]
85+
if !ok {
86+
return nil, stacktrace.NewError("missing %s field", key)
87+
}
88+
duration, err := time.ParseDuration(valueStr)
89+
if err != nil {
90+
return nil, stacktrace.NewError("unable to convert %s field to a time duration", key)
91+
}
92+
return &duration, nil
93+
}
94+
8195
func composeTestForFaultType(
8296
faultType FaultTypeEnum,
8397
config map[string]string,
@@ -95,28 +109,46 @@ func composeTestForFaultType(
95109
if !ok {
96110
return nil, stacktrace.NewError("missing duration field for clock skew fault")
97111
}
98-
grace, ok := config["grace_period"]
99-
if !ok {
100-
return nil, stacktrace.NewError("missing grace_period field for clock skew fault")
101-
}
102-
graceDuration, err := time.ParseDuration(grace)
112+
graceDuration, err := getDurationValue("grace_period", config)
103113
if err != nil {
104-
return nil, stacktrace.NewError("unable to convert grace_period field to a time duration for clock skew fault")
114+
return nil, err
105115
}
106116

107117
description := fmt.Sprintf("Apply %s clock skew for %s against %d targets. %s", skew, duration, len(targetSelectors), targetingDescription)
108118
return composeNodeClockSkewTest(description, targetSelectors, skew, duration, graceDuration)
109119
case FaultContainerRestart:
110-
grace, ok := config["grace_period"]
111-
if !ok {
112-
return nil, stacktrace.NewError("missing grace_period field for restsrt fault")
113-
}
114-
graceDuration, err := time.ParseDuration(grace)
120+
121+
graceDuration, err := getDurationValue("grace_period", config)
115122
if err != nil {
116-
return nil, stacktrace.NewError("unable to convert grace_period field to a time duration for clock skew fault")
123+
return nil, err
117124
}
118125
description := fmt.Sprintf("Restarting %d targets. %s", len(targetSelectors), targetingDescription)
119126
return composeNodeRestartTest(description, targetSelectors, graceDuration)
127+
case FaultIOLatency:
128+
grace, err := getDurationValue("grace_period", config)
129+
if err != nil {
130+
return nil, err
131+
}
132+
delay, err := getDurationValue("delay", config)
133+
if err != nil {
134+
return nil, err
135+
}
136+
faultDuration, err := getDurationValue("duration", config)
137+
if err != nil {
138+
return nil, err
139+
}
140+
141+
percent, ok := config["percent"]
142+
if !ok {
143+
return nil, stacktrace.NewError("missing percent field in io latency fault")
144+
}
145+
percentInt, err := strconv.Atoi(percent)
146+
if err != nil {
147+
return nil, stacktrace.Propagate(err, "unable to parse io latency fault percent field")
148+
}
149+
description := fmt.Sprintf("Apply %s i/o latency for %s. Impacting %d pct of i/o calls. against %d targets. %s", delay, faultDuration, percentInt, len(targetSelectors), targetingDescription)
150+
151+
return composeIOLatencyTest(description, targetSelectors, delay, percentInt, faultDuration, grace)
120152
}
121153

122154
return nil, nil

0 commit comments

Comments
 (0)