Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 138 additions & 0 deletions cmd/chaos-pick-node.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
package cmd

import (
"fmt"
"slices"

"github.com/couchbaselabs/cbdinocluster/utils/clustercontrol"
"github.com/spf13/cobra"
"go.uber.org/zap"
)

var chaosPickNodeCmd = &cobra.Command{
Use: "pick-node <cluster-id>",
Short: "Picks a single non-orchestrator node from the cluster for chaos testing",
Args: cobra.ExactArgs(1),
Run: func(cmd *cobra.Command, args []string) {
helper := CmdHelper{}
logger := helper.GetLogger()
ctx := helper.GetContext()

clusterId := args[0]
selectOrchestrator, _ := cmd.Flags().GetBool("orchestrator")

_, _, cluster := helper.IdentifyCluster(ctx, clusterId)

// collect only cluster nodes (exclude utility nodes like load balancers)
type candidateNode struct {
ID string
IPAddress string
}

var candidates []candidateNode
for _, node := range cluster.GetNodes() {
if !node.IsClusterNode() {
continue
}
candidates = append(candidates, candidateNode{
ID: node.GetID(),
IPAddress: node.GetIPAddress(),
})
}

if len(candidates) == 0 {
logger.Fatal("no cluster nodes found")
}

// sort for deterministic selection
slices.SortFunc(candidates, func(a, b candidateNode) int {
if a.ID < b.ID {
return -1
}
if a.ID > b.ID {
return 1
}
return 0
})

// query ns_server to find the orchestrator
firstNodeIP := candidates[0].IPAddress
ctrl := &clustercontrol.Controller{
Logger: logger,
Endpoint: fmt.Sprintf("http://%s:8091", firstNodeIP),
}

terseInfo, err := ctrl.GetTerseClusterInfo(ctx)
if err != nil {
logger.Fatal("failed to get terse cluster info", zap.Error(err))
}

orchestratorOTP := terseInfo.Orchestrator

// resolve orchestrator OTP to a node IP by querying each node
var orchestratorIP string
for _, c := range candidates {
nodeCtrl := &clustercontrol.Controller{
Logger: logger,
Endpoint: fmt.Sprintf("http://%s:8091", c.IPAddress),
}

localInfo, err := nodeCtrl.GetLocalInfo(ctx)
if err != nil {
logger.Warn("failed to get local info for node, skipping",
zap.String("node", c.ID),
zap.Error(err))
continue
}

if localInfo.OTPNode == orchestratorOTP {
orchestratorIP = c.IPAddress
break
}
}

if selectOrchestrator {
// select the orchestrator specifically
if orchestratorIP == "" {
logger.Fatal("could not identify orchestrator node")
}

var orchestrator *candidateNode
for _, c := range candidates {
if c.IPAddress == orchestratorIP {
foundOrchestrator := c
orchestrator = &foundOrchestrator
break
}
}

candidates = []candidateNode{*orchestrator}
Comment thread
brett19 marked this conversation as resolved.
} else {
// exclude the orchestrator (default behavior)
if orchestratorIP != "" {
var filtered []candidateNode
for _, c := range candidates {
if c.IPAddress != orchestratorIP {
filtered = append(filtered, c)
}
}
candidates = filtered
} else {
logger.Warn("could not identify orchestrator node, selecting from all nodes")
}

if len(candidates) == 0 {
logger.Fatal("no eligible nodes after excluding orchestrator")
}
}

selected := candidates[0]
fmt.Println(selected.ID)
},
}

func init() {
chaosCmd.AddCommand(chaosPickNodeCmd)

chaosPickNodeCmd.Flags().Bool("orchestrator", false, "Select the orchestrator node instead of a non-orchestrator")
}
30 changes: 30 additions & 0 deletions cmd/cluster-settings-disable-autofailover.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package cmd

import (
"github.com/spf13/cobra"
"go.uber.org/zap"
)

var clusterSettingsDisableAutoFailoverCmd = &cobra.Command{
Use: "disable-autofailover <cluster-id>",
Short: "Disables auto-failover for a cluster",
Args: cobra.ExactArgs(1),
Run: func(cmd *cobra.Command, args []string) {
helper := CmdHelper{}
logger := helper.GetLogger()
ctx := helper.GetContext()

clusterId := args[0]

_, deployer, cluster := helper.IdentifyCluster(ctx, clusterId)

err := deployer.SetAutoFailover(ctx, cluster.GetID(), false, 0)
if err != nil {
logger.Fatal("failed to disable auto-failover", zap.Error(err))
}
},
}

func init() {
clusterSettingsCmd.AddCommand(clusterSettingsDisableAutoFailoverCmd)
}
33 changes: 33 additions & 0 deletions cmd/cluster-settings-enable-autofailover.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package cmd

import (
"github.com/spf13/cobra"
"go.uber.org/zap"
)

var clusterSettingsEnableAutoFailoverCmd = &cobra.Command{
Use: "enable-autofailover <cluster-id>",
Short: "Enables auto-failover for a cluster",
Args: cobra.ExactArgs(1),
Run: func(cmd *cobra.Command, args []string) {
helper := CmdHelper{}
logger := helper.GetLogger()
ctx := helper.GetContext()

clusterId := args[0]
timeout, _ := cmd.Flags().GetInt("timeout")

_, deployer, cluster := helper.IdentifyCluster(ctx, clusterId)

err := deployer.SetAutoFailover(ctx, cluster.GetID(), true, timeout)
if err != nil {
logger.Fatal("failed to enable auto-failover", zap.Error(err))
}
},
}

func init() {
clusterSettingsCmd.AddCommand(clusterSettingsEnableAutoFailoverCmd)

clusterSettingsEnableAutoFailoverCmd.Flags().Int("timeout", 120, "Auto-failover timeout in seconds")
}
15 changes: 15 additions & 0 deletions cmd/cluster-settings.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package cmd

import (
"github.com/spf13/cobra"
)

var clusterSettingsCmd = &cobra.Command{
Use: "cluster-settings",
Short: "Provides cluster settings management tools",
Run: nil,
}

func init() {
rootCmd.AddCommand(clusterSettingsCmd)
}
3 changes: 1 addition & 2 deletions cmd/users-add.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package cmd

import (
"fmt"
"time"

"github.com/couchbaselabs/cbdinocluster/deployment"
Expand Down Expand Up @@ -46,7 +45,7 @@ var usersAddCmd = &cobra.Command{
for {
users, err := deployer.ListUsers(ctx, cluster.GetID())
if err != nil {
logger.Fatal(fmt.Sprintf("failed to wait for user to be ready: %w", err))
logger.Fatal("failed to wait for user to be ready", zap.Error(err))
}

for _, user := range users {
Expand Down
4 changes: 4 additions & 0 deletions deployment/caodeploy/deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -788,3 +788,7 @@ func (d *Deployer) RebalanceCluster(ctx context.Context, clusterID string, nodes
func (d *Deployer) KillCouchbase(ctx context.Context, clusterID string, nodes []string) error {
return errors.New("caodeploy does not support killing couchbase process")
}

func (d *Deployer) SetAutoFailover(ctx context.Context, clusterID string, enabled bool, timeout int) error {
return errors.New("caodeploy does not support setting auto-failover")
}
4 changes: 4 additions & 0 deletions deployment/clouddeploy/deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -2503,3 +2503,7 @@ func (d *Deployer) RebalanceCluster(ctx context.Context, clusterID string, nodes
func (d *Deployer) KillCouchbase(ctx context.Context, clusterID string, nodeIDs []string) error {
return errors.New("clouddeploy does not support killing couchbase process")
}

func (d *Deployer) SetAutoFailover(ctx context.Context, clusterID string, enabled bool, timeout int) error {
return errors.New("clouddeploy does not support setting auto-failover")
}
1 change: 1 addition & 0 deletions deployment/deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,4 +152,5 @@ type Deployer interface {
DropLink(ctx context.Context, columnarID, linkName string) error
EnableDataApi(ctx context.Context, clusterID string) error
KillCouchbase(ctx context.Context, clusterID string, nodes []string) error
SetAutoFailover(ctx context.Context, clusterID string, enabled bool, timeout int) error
}
12 changes: 12 additions & 0 deletions deployment/dockerdeploy/deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -1399,6 +1399,18 @@ func (d *Deployer) RedeployCluster(ctx context.Context, clusterID string) error
return errors.New("docker deploy does not support redeploy cluster")
}

func (d *Deployer) SetAutoFailover(ctx context.Context, clusterID string, enabled bool, timeout int) error {
controller, err := d.getController(ctx, clusterID)
if err != nil {
return errors.Wrap(err, "failed to get controller for cluster")
}

return controller.Controller().SetAutoFailover(ctx, &clustercontrol.SetAutoFailoverOptions{
Enabled: enabled,
Timeout: timeout,
})
}

func (d *Deployer) CreateCapellaLink(ctx context.Context, columnarID, linkName, clusterId, directID string) error {
return errors.New("docker deploy does not support create capella link")
}
Expand Down
4 changes: 4 additions & 0 deletions deployment/localdeploy/deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -279,3 +279,7 @@ func (d *Deployer) RebalanceCluster(ctx context.Context, clusterID string, nodes
func (d *Deployer) KillCouchbase(ctx context.Context, clusterID string, nodes []string) error {
return errors.New("localdeploy does not support killing couchbase process")
}

func (d *Deployer) SetAutoFailover(ctx context.Context, clusterID string, enabled bool, timeout int) error {
return errors.New("localdeploy does not support setting auto-failover")
}
14 changes: 14 additions & 0 deletions utils/clustercontrol/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -963,6 +963,20 @@ func (c *Controller) SetAppTelemetry(ctx context.Context, opts *AppTelemetryOpti
return c.doFormPost(ctx, "/settings/appTelemetry", form, true, nil)
}

type SetAutoFailoverOptions struct {
Enabled bool
Timeout int
}

func (c *Controller) SetAutoFailover(ctx context.Context, opts *SetAutoFailoverOptions) error {
Comment thread
brett19 marked this conversation as resolved.
form := make(url.Values)
form.Add("enabled", strconv.FormatBool(opts.Enabled))
if opts.Timeout > 0 {
form.Add("timeout", strconv.Itoa(opts.Timeout))
}
Comment thread
brett19 marked this conversation as resolved.
return c.doFormPost(ctx, "/settings/autoFailover", form, true, nil)
}

func (c *Controller) GetMetrics(ctx context.Context) (string, error) {
var resp []byte
err := c.doGet(ctx, "/metrics", &resp)
Expand Down
Loading