Skip to content

Commit 24fec14

Browse files
Merge branch 'litmuschaos:master' into singleton-client
2 parents d5c5295 + c0c93c1 commit 24fec14

10 files changed

Lines changed: 261 additions & 46 deletions

File tree

CONTRIBUTING.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ We welcome contributions of all kinds:
1515
## Getting Started
1616

1717
### Ask Questions or Discuss Ideas
18-
- Join our community on [Slack](http://slack.litmuschaos.io) and post in **#litmus** for general questions or **#litmus-dev** for technical discussions.
18+
- Join our community on [Slack](https://slack.litmuschaos.io) and post in **#litmus** for general questions or **#litmus-dev** for technical discussions.
1919

2020
### Report Issues or Propose Changes
2121
- Open a new [GitHub issue](https://github.com/litmuschaos/litmus/issues/new) describing your request, bug, or feature idea.
@@ -27,7 +27,7 @@ We welcome contributions of all kinds:
2727
- **Manual setup**
2828
Follow the [Local Development Guide](https://github.com/litmuschaos/litmus/wiki/ChaosCenter-Development-Guide) if you prefer to set up and configure the environment yourself.
2929
- Review [Development Best Practices](https://github.com/litmuschaos/litmus/wiki/Development-Best-Practices)
30-
- For Go contributors, read [Effective Go](https://golang.org/doc/effective_go.html) and [Go Code Review Comments](https://go.dev/wiki/CodeReviewComments)
30+
- For Go contributors, read [Effective Go](https://go.dev/doc/effective_go) and [Go Code Review Comments](https://go.dev/wiki/CodeReviewComments)
3131

3232
You can contribute fixes and improvements by submitting a Pull Request (PR) on GitHub. Each PR will be reviewed by one or more maintainers and merged once it meets the project’s standards.
3333

@@ -64,8 +64,8 @@ For full details, see the [DCO documentation](https://developercertificate.org/)
6464

6565
3. **Develop & Test**
6666
- Follow backend and frontend coding guidelines:
67-
- **Backend:** [Go Code Review Comments](https://code.google.com/p/go-wiki/wiki/CodeReviewComments), [Best Practices](https://peter.bourgon.org/go-in-production/#formatting-and-style)
68-
- **Frontend:** [Airbnb React Style Guide](https://airbnb.io/javascript/react/)
67+
- **Backend:** [Go Code Review Comments](https://go.dev/wiki/CodeReviewComments), [Best Practices](https://peter.bourgon.org/go-in-production/#formatting-and-style)
68+
- **Frontend:** [Airbnb React Style Guide](https://javascript.airbnb.tech)
6969

7070
- If you are making any changes in backend, make sure you have run and tested the code locally, the reviewers might ask for relevant screenshots in the comments.
7171
- Include relevant tests for new code or bug fixes
@@ -82,7 +82,7 @@ For full details, see the [DCO documentation](https://developercertificate.org/)
8282
- Merge occurs when all checks pass and reviews are approved
8383
- If your PR is large or high-impact, coordinate with maintainers in **#litmus-dev** Slack
8484

85-
If you are new to Go, consider reading [Effective Go](https://golang.org/doc/effective_go.html) and [Go Code Review Comments](https://github.com/golang/go/wiki/CodeReviewComments) for guidance on writing idiomatic Go code.
85+
If you are new to Go, consider reading [Effective Go](https://go.dev/doc/effective_go) and [Go Code Review Comments](https://go.dev/wiki/CodeReviewComments) for guidance on writing idiomatic Go code.
8686

8787
## Pull Request Checklist
8888
- [ ] Rebase to the current master branch before submitting your pull request.

chaoscenter/event-tracker/controllers/eventtrackerpolicy_controller.go

Lines changed: 83 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,11 @@ import (
2020
"context"
2121
"encoding/json"
2222
"strings"
23-
"sync"
2423

2524
"github.com/litmuschaos/litmus/chaoscenter/event-tracker/pkg/utils"
2625
"github.com/sirupsen/logrus"
2726
"k8s.io/apimachinery/pkg/api/errors"
27+
"k8s.io/client-go/util/retry"
2828

2929
"k8s.io/apimachinery/pkg/runtime"
3030
ctrl "sigs.k8s.io/controller-runtime"
@@ -62,50 +62,99 @@ type apiResponse struct {
6262
func (r *EventTrackerPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
6363
_ = log.FromContext(ctx)
6464

65-
var mutex = &sync.Mutex{}
66-
mutex.Lock()
65+
// Phase 1: atomically claim pending triggers by flipping IsTriggered to "true"
66+
// before any side effect. A concurrent reconcile that loses the CAS will re-read
67+
// and observe the winner's claim, so each trigger is owned by exactly one loop.
68+
var claimed []string
69+
err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
70+
claimed = nil
71+
72+
var etp eventtrackerv1.EventTrackerPolicy
73+
if err := r.Client.Get(ctx, req.NamespacedName, &etp); err != nil {
74+
if errors.IsNotFound(err) {
75+
logrus.Infof("EventTrackerPolicy %s not found", req.NamespacedName)
76+
return nil
77+
}
78+
return err
79+
}
6780

68-
var etp eventtrackerv1.EventTrackerPolicy
69-
err := r.Client.Get(context.Background(), req.NamespacedName, &etp)
70-
if errors.IsNotFound(err) {
71-
logrus.Infof("namespace: %s not found", req.NamespacedName)
72-
return ctrl.Result{}, nil
73-
} else if err != nil {
81+
for i, s := range etp.Statuses {
82+
if s.Result == utils.ConditionPassed && strings.ToLower(s.IsTriggered) == "false" {
83+
etp.Statuses[i].IsTriggered = "true"
84+
claimed = append(claimed, s.ExperimentID)
85+
logrus.Printf("claiming trigger resource=%s experimentID=%s", s.ResourceName, s.ExperimentID)
86+
}
87+
}
88+
if len(claimed) == 0 {
89+
return nil
90+
}
91+
return r.Client.Update(ctx, &etp)
92+
})
93+
if err != nil {
7494
return ctrl.Result{}, err
7595
}
7696

77-
for index, status := range etp.Statuses {
78-
if status.Result == utils.ConditionPassed && strings.ToLower(status.IsTriggered) == "false" {
79-
logrus.Print("ResourceName: " + status.ResourceName + ", ExperimentID: " + status.ExperimentID)
80-
response, err := utils.SendRequest(status.ExperimentID)
81-
if err != nil {
82-
return ctrl.Result{}, err
83-
}
84-
85-
logrus.Print(response)
97+
// Phase 2: side-effect dispatch. Executed outside the retry loop so each
98+
// claim triggers exactly one SendRequest. On a Gitops-Disabled response we
99+
// restore IsTriggered="false" to match the pre-fix behavior (retryable).
100+
var toRevert []string
101+
var dispatchErr error
102+
for _, experimentID := range claimed {
103+
response, sendErr := utils.SendRequest(experimentID)
104+
if sendErr != nil {
105+
logrus.WithError(sendErr).Errorf("failed to trigger experiment %s", experimentID)
106+
toRevert = append(toRevert, experimentID)
107+
dispatchErr = sendErr
108+
continue
109+
}
86110

87-
var res apiResponse
88-
err = json.Unmarshal([]byte(response), &res)
89-
if err != nil {
90-
return ctrl.Result{}, err
91-
}
111+
var res apiResponse
112+
if jsonErr := json.Unmarshal([]byte(response), &res); jsonErr != nil {
113+
logrus.WithError(jsonErr).Errorf("failed to parse response for experiment %s", experimentID)
114+
toRevert = append(toRevert, experimentID)
115+
dispatchErr = jsonErr
116+
continue
117+
}
92118

93-
if res.Data.GitopsNotifer == "Gitops Disabled" {
94-
etp.Statuses[index].IsTriggered = "false"
95-
} else {
96-
etp.Statuses[index].IsTriggered = "true"
97-
}
119+
if res.Data.GitopsNotifer == "Gitops Disabled" {
120+
logrus.Infof("gitops disabled for experiment %s; releasing claim", experimentID)
121+
toRevert = append(toRevert, experimentID)
122+
continue
98123
}
124+
logrus.Infof("successfully triggered experiment %s", experimentID)
99125
}
100126

101-
err = r.Client.Update(context.Background(), &etp)
102-
if err != nil {
103-
return ctrl.Result{}, err
127+
if len(toRevert) > 0 {
128+
revertErr := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
129+
var etp eventtrackerv1.EventTrackerPolicy
130+
if err := r.Client.Get(ctx, req.NamespacedName, &etp); err != nil {
131+
if errors.IsNotFound(err) {
132+
return nil
133+
}
134+
return err
135+
}
136+
revertSet := make(map[string]struct{}, len(toRevert))
137+
for _, id := range toRevert {
138+
revertSet[id] = struct{}{}
139+
}
140+
changed := false
141+
for i, s := range etp.Statuses {
142+
if _, ok := revertSet[s.ExperimentID]; ok && strings.ToLower(s.IsTriggered) == "true" {
143+
etp.Statuses[i].IsTriggered = "false"
144+
changed = true
145+
}
146+
}
147+
if !changed {
148+
return nil
149+
}
150+
return r.Client.Update(ctx, &etp)
151+
})
152+
if revertErr != nil {
153+
return ctrl.Result{}, revertErr
154+
}
104155
}
105156

106-
defer mutex.Unlock()
107-
108-
return ctrl.Result{}, nil
157+
return ctrl.Result{}, dispatchErr
109158
}
110159

111160
// SetupWithManager sets up the controller with the Manager.

chaoscenter/graphql/server/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# BUILD STAGE
2-
FROM golang:1.24 AS builder
2+
FROM golang:1.26 AS builder
33

44
LABEL maintainer="LitmusChaos"
55

@@ -17,7 +17,7 @@ RUN CGO_ENABLED=0 go build -o /output/server -v
1717

1818
# DEPLOY STAGE
1919
# Use Red Hat UBI minimal image as base
20-
FROM registry.access.redhat.com/ubi9/ubi-minimal:9.6
20+
FROM registry.access.redhat.com/ubi9/ubi-minimal:9.7
2121

2222
LABEL maintainer="LitmusChaos"
2323

chaoscenter/graphql/server/pkg/chaos_experiment/handler/handler.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1551,7 +1551,14 @@ func (c *ChaosExperimentHandler) StopExperimentRuns(ctx context.Context, project
15511551
}
15521552

15531553
for _, runID := range experimentRunsID {
1554-
err = c.chaosExperimentRunService.ProcessExperimentRunStop(ctx, query, &runID, experiment, username, projectID, r)
1554+
// scope the update to the specific run so we don't accidentally touch sibling runs
1555+
runQuery := bson.D{
1556+
{"experiment_id", experimentID},
1557+
{"project_id", projectID},
1558+
{"experiment_run_id", runID},
1559+
{"is_removed", false},
1560+
}
1561+
err = c.chaosExperimentRunService.ProcessExperimentRunStop(ctx, runQuery, &runID, experiment, username, projectID, r)
15551562
if err != nil {
15561563
return false, err
15571564
}

chaoscenter/graphql/server/pkg/chaos_experiment_run/service.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,15 @@ func (c *chaosExperimentRunService) ProcessExperimentRunDelete(ctx context.Conte
7171
return nil
7272
}
7373

74-
// ProcessExperimentRunStop deletes a workflow entry and updates the database
74+
// ProcessExperimentRunStop marks an experiment run as stopped in the database and notifies
75+
// the infra subscriber if it's currently connected. Updating the DB here is important because
76+
// the infra might be disconnected (e.g. agent redeployed), in which case the stop signal
77+
// never reaches the agent and the run would be stuck in Running state forever.
7578
func (c *chaosExperimentRunService) ProcessExperimentRunStop(ctx context.Context, query bson.D, experimentRunID *string, experiment dbChaosExperiment.ChaosExperimentRequest, username string, projectID string, r *store.StateData) error {
7679
update := bson.D{
7780
{"$set", bson.D{
81+
{"phase", string(model.ExperimentRunStatusStopped)},
82+
{"completed", true},
7883
{"updated_at", time.Now().UnixMilli()},
7984
{"updated_by", mongodb.UserDetailResponse{
8085
Username: username,
@@ -86,6 +91,8 @@ func (c *chaosExperimentRunService) ProcessExperimentRunStop(ctx context.Context
8691
if err != nil {
8792
return err
8893
}
94+
// Best-effort: notify the agent to clean up in-flight resources. If the infra
95+
// is not connected the channel send is a no-op and the DB state is still correct.
8996
if r != nil {
9097
chaos_infrastructure.SendExperimentToSubscriber(projectID, &model.ChaosExperimentRequest{
9198
InfraID: experiment.InfraID,

chaoscenter/graphql/server/pkg/chaos_experiment_run/service_test.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,97 @@ func Test_chaosExperimentRunService_ProcessExperimentRunDelete(t *testing.T) {
155155
}
156156
}
157157

158+
func Test_chaosExperimentRunService_ProcessExperimentRunStop(t *testing.T) {
159+
type args struct {
160+
ctx context.Context
161+
query bson.D
162+
experimentRun dbChaosExperimentRun.ChaosExperimentRun
163+
workflow dbChaosExperiment.ChaosExperimentRequest
164+
username string
165+
r *store.StateData
166+
}
167+
experimentRunID := uuid.New().String()
168+
projectID := uuid.New().String()
169+
experimentID := uuid.New().String()
170+
infraID := uuid.New().String()
171+
172+
testcases := []struct {
173+
name string
174+
args args
175+
given func()
176+
wantErr bool
177+
}{
178+
{
179+
name: "success: stop marks run as Stopped in DB even when infra is disconnected",
180+
args: args{
181+
ctx: context.Background(),
182+
query: bson.D{
183+
{Key: "experiment_id", Value: experimentID},
184+
{Key: "project_id", Value: projectID},
185+
{Key: "experiment_run_id", Value: experimentRunID},
186+
{Key: "is_removed", Value: false},
187+
},
188+
experimentRun: dbChaosExperimentRun.ChaosExperimentRun{
189+
ProjectID: projectID,
190+
ExperimentID: experimentID,
191+
ExperimentRunID: experimentRunID,
192+
InfraID: infraID,
193+
Phase: "Running",
194+
},
195+
workflow: dbChaosExperiment.ChaosExperimentRequest{
196+
ProjectID: projectID,
197+
InfraID: infraID,
198+
ExperimentID: experimentID,
199+
},
200+
username: "test",
201+
// nil store means no subscriber channel — simulates disconnected infra
202+
r: nil,
203+
},
204+
given: func() {
205+
mongodbMockOperator.On("Update", mock.Anything, mongodb.ChaosExperimentRunsCollection, mock.Anything, mock.Anything, mock.Anything).Return(&mongo.UpdateResult{MatchedCount: 1}, nil).Once()
206+
},
207+
wantErr: false,
208+
},
209+
{
210+
name: "failure: DB error propagates",
211+
args: args{
212+
ctx: context.Background(),
213+
query: bson.D{
214+
{Key: "experiment_id", Value: experimentID},
215+
{Key: "project_id", Value: projectID},
216+
{Key: "experiment_run_id", Value: experimentRunID},
217+
{Key: "is_removed", Value: false},
218+
},
219+
experimentRun: dbChaosExperimentRun.ChaosExperimentRun{
220+
ProjectID: projectID,
221+
ExperimentID: experimentID,
222+
ExperimentRunID: experimentRunID,
223+
InfraID: infraID,
224+
},
225+
workflow: dbChaosExperiment.ChaosExperimentRequest{
226+
ProjectID: projectID,
227+
InfraID: infraID,
228+
ExperimentID: experimentID,
229+
},
230+
username: "test",
231+
r: nil,
232+
},
233+
given: func() {
234+
mongodbMockOperator.On("Update", mock.Anything, mongodb.ChaosExperimentRunsCollection, mock.Anything, mock.Anything, mock.Anything).Return(&mongo.UpdateResult{MatchedCount: 0}, errors.New("db error")).Once()
235+
},
236+
wantErr: true,
237+
},
238+
}
239+
for _, tc := range testcases {
240+
tc.given()
241+
t.Run(tc.name, func(t *testing.T) {
242+
if err := chaosExperimentRunTestService.ProcessExperimentRunStop(tc.args.ctx, tc.args.query, &experimentRunID, tc.args.workflow, tc.args.username, projectID, tc.args.r); (err != nil) != tc.wantErr {
243+
t.Errorf("chaosExperimentRunService.ProcessExperimentRunStop() error = %v, wantErr %v", err, tc.wantErr)
244+
}
245+
})
246+
}
247+
}
248+
158249
func Test_chaosExperimentRunService_ProcessCompletedExperimentRun(t *testing.T) {
159250
type args struct {
160251
execData ExecutionData

chaoscenter/graphql/server/pkg/chaos_infrastructure/service.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -899,7 +899,9 @@ func (in *infraService) GetVersionDetails() (*model.InfraVersionDetails, error)
899899
compatibleVersions := utils.Config.InfraCompatibleVersions
900900

901901
var compatibleArray []string
902-
_ = json.Unmarshal([]byte(compatibleVersions), &compatibleArray)
902+
if err := json.Unmarshal([]byte(compatibleVersions), &compatibleArray); err != nil {
903+
return &model.InfraVersionDetails{}, fmt.Errorf("failed to parse InfraCompatibleVersions config: %w", err)
904+
}
903905

904906
// To find the latest compatible version
905907
compatibleMap := make(map[int]string)

0 commit comments

Comments
 (0)