Skip to content

Commit 1272009

Browse files
committed
SDKQE-3671: Add failover node commands to docker deployer
New commands: - 'failover' node subcommand with hard/graceful options - 'set-recovery' node subcommand with full/delta recovery type options - 'rebalance' to rebalance the cluster, optionally ejecting the specified nodes
1 parent 465afa3 commit 1272009

9 files changed

Lines changed: 316 additions & 0 deletions

File tree

cmd/nodes-failover.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
package cmd
2+
3+
import (
4+
"github.com/couchbaselabs/cbdinocluster/deployment"
5+
"github.com/spf13/cobra"
6+
"go.uber.org/zap"
7+
"strings"
8+
)
9+
10+
var nodesFailoverCmd = &cobra.Command{
11+
Use: "failover <cluster-id> <node-id-or-ip>",
12+
Short: "Failover a node in the cluster",
13+
Args: cobra.MinimumNArgs(1),
14+
Run: func(cmd *cobra.Command, args []string) {
15+
helper := CmdHelper{}
16+
logger := helper.GetLogger()
17+
ctx := helper.GetContext()
18+
19+
failOverTypeStr, _ := cmd.Flags().GetString("type")
20+
allowUnsafe, _ := cmd.Flags().GetBool("allow-unsafe")
21+
22+
_, deployer, cluster := helper.IdentifyCluster(ctx, args[0])
23+
node := helper.IdentifyNode(ctx, cluster, args[1])
24+
25+
var failOverType deployment.FailOverType
26+
switch strings.ToLower(failOverTypeStr) {
27+
case "hard":
28+
failOverType = deployment.HardFailOver
29+
case "graceful":
30+
failOverType = deployment.GracefulFailOver
31+
default:
32+
logger.Fatal("unexpected fail over type",
33+
zap.String("type", failOverTypeStr))
34+
}
35+
36+
err := deployer.FailOverNode(ctx, cluster.GetID(), node.GetID(), failOverType, allowUnsafe)
37+
if err != nil {
38+
logger.Fatal("failed to fail over node", zap.Error(err))
39+
}
40+
},
41+
}
42+
43+
func init() {
44+
nodesCmd.AddCommand(nodesFailoverCmd)
45+
nodesFailoverCmd.Flags().String("type", "", "the type of failover [hard|graceful]")
46+
nodesFailoverCmd.Flags().Bool("allow-unsafe", false, "allow unsafe failover (for hard failover only)")
47+
}

cmd/nodes-set-recovery.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
package cmd
2+
3+
import (
4+
"github.com/couchbaselabs/cbdinocluster/deployment"
5+
"github.com/spf13/cobra"
6+
"go.uber.org/zap"
7+
"strings"
8+
)
9+
10+
var nodesSetRecoveryCmd = &cobra.Command{
11+
Use: "set-recovery <cluster-id> <node-id-or-ip>",
12+
Short: "Set the recovery type for a node in the cluster",
13+
Args: cobra.MinimumNArgs(1),
14+
Run: func(cmd *cobra.Command, args []string) {
15+
helper := CmdHelper{}
16+
logger := helper.GetLogger()
17+
ctx := helper.GetContext()
18+
19+
recoveryTypeStr, _ := cmd.Flags().GetString("type")
20+
21+
_, deployer, cluster := helper.IdentifyCluster(ctx, args[0])
22+
node := helper.IdentifyNode(ctx, cluster, args[1])
23+
24+
var recoveryType deployment.RecoveryType
25+
switch strings.ToLower(recoveryTypeStr) {
26+
case "full":
27+
recoveryType = deployment.FullRecovery
28+
case "delta":
29+
recoveryType = deployment.DeltaRecovery
30+
default:
31+
logger.Fatal("unexpected recovery type",
32+
zap.String("type", recoveryTypeStr))
33+
}
34+
35+
err := deployer.SetNodeRecovery(ctx, cluster.GetID(), node.GetID(), recoveryType)
36+
if err != nil {
37+
logger.Fatal("failed to recovery type", zap.Error(err))
38+
}
39+
},
40+
}
41+
42+
func init() {
43+
nodesCmd.AddCommand(nodesSetRecoveryCmd)
44+
nodesSetRecoveryCmd.Flags().String("type", "", "the type of failover recovery [full|delta]")
45+
}

cmd/rebalance.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package cmd
2+
3+
import (
4+
"github.com/spf13/cobra"
5+
"go.uber.org/zap"
6+
)
7+
8+
var rebalanceCmd = &cobra.Command{
9+
Use: "rebalance <cluster-id> [<node-id-or-ip-to-eject> ...]",
10+
Short: "Rebalance the cluster, ejecting any specified nodes",
11+
Args: cobra.MinimumNArgs(1),
12+
Run: func(cmd *cobra.Command, args []string) {
13+
helper := CmdHelper{}
14+
logger := helper.GetLogger()
15+
ctx := helper.GetContext()
16+
17+
_, deployer, cluster := helper.IdentifyCluster(ctx, args[0])
18+
nodesToEject := args[1:]
19+
var nodeIds []string
20+
for _, nodeIdent := range nodesToEject {
21+
node := helper.IdentifyNode(ctx, cluster, nodeIdent)
22+
nodeIds = append(nodeIds, node.GetID())
23+
}
24+
25+
err := deployer.RebalanceCluster(ctx, cluster.GetID(), nodeIds)
26+
if err != nil {
27+
logger.Fatal("failed to rebalance cluster", zap.Error(err))
28+
}
29+
},
30+
}
31+
32+
// init registers rebalanceCmd on rootCmd.
func init() {
33+
rootCmd.AddCommand(rebalanceCmd)
34+
}

deployment/caodeploy/deployer.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -768,3 +768,15 @@ func (d *Deployer) UpgradeCluster(ctx context.Context, clusterID string, Current
768768
// EnableDataApi is not supported by caodeploy; it always returns an error.
func (d *Deployer) EnableDataApi(ctx context.Context, clusterID string) error {
769769
return errors.New("caodeploy does not support enabling data api")
770770
}
771+
772+
// FailOverNode is not supported by caodeploy; it always returns an error.
func (d *Deployer) FailOverNode(ctx context.Context, clusterID string, nodeID string, failOverType deployment.FailOverType, allowUnsafe bool) error {
773+
return errors.New("caodeploy does not support failing over a node")
774+
}
775+
776+
// SetNodeRecovery is not supported by caodeploy; it always returns an error.
func (d *Deployer) SetNodeRecovery(ctx context.Context, clusterID string, nodeID string, recoverType deployment.RecoveryType) error {
777+
return errors.New("caodeploy does not support failover recovery")
778+
}
779+
780+
// RebalanceCluster is not supported by caodeploy; it always returns an error.
func (d *Deployer) RebalanceCluster(ctx context.Context, clusterID string, nodesToEject []string) error {
781+
return errors.New("caodeploy does not support rebalance cluster")
782+
}

deployment/clouddeploy/deployer.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2467,3 +2467,15 @@ func (d *Deployer) PauseNode(ctx context.Context, clusterID string, nodeID strin
24672467
// UnpauseNode is not supported by clouddeploy; it always returns an error.
func (d *Deployer) UnpauseNode(ctx context.Context, clusterID string, nodeID string) error {
24682468
return errors.New("clouddeploy does not support node pausing")
24692469
}
2470+
2471+
// FailOverNode is not supported by clouddeploy; it always returns an error.
func (d *Deployer) FailOverNode(ctx context.Context, clusterID string, nodeID string, failOverType deployment.FailOverType, allowUnsafe bool) error {
2472+
return errors.New("clouddeploy does not support failing over a node")
2473+
}
2474+
2475+
// SetNodeRecovery is not supported by clouddeploy; it always returns an error.
func (d *Deployer) SetNodeRecovery(ctx context.Context, clusterID string, nodeID string, recoverType deployment.RecoveryType) error {
2476+
return errors.New("clouddeploy does not support failover recovery")
2477+
}
2478+
2479+
// RebalanceCluster is not supported by clouddeploy; it always returns an error.
func (d *Deployer) RebalanceCluster(ctx context.Context, clusterID string, nodesToEject []string) error {
2480+
return errors.New("clouddeploy does not support rebalance cluster")
2481+
}

deployment/deployer.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,20 @@ const (
9292
BlockNodeTrafficAll BlockNodeTrafficType = "all"
9393
)
9494

95+
// FailOverType names the kind of failover requested through
// Deployer.FailOverNode.
type FailOverType string
96+
97+
const (
98+
// HardFailOver requests a hard failover.
HardFailOver FailOverType = "hard"
99+
// GracefulFailOver requests a graceful failover.
GracefulFailOver FailOverType = "graceful"
100+
)
101+
102+
// RecoveryType names the recovery mode requested through
// Deployer.SetNodeRecovery.
type RecoveryType string
103+
104+
const (
105+
// FullRecovery requests a full recovery.
FullRecovery RecoveryType = "full"
106+
// DeltaRecovery requests a delta recovery.
DeltaRecovery RecoveryType = "delta"
107+
)
108+
95109
type Deployer interface {
96110
ListClusters(ctx context.Context) ([]ClusterInfo, error)
97111
NewCluster(ctx context.Context, def *clusterdef.Cluster) (ClusterInfo, error)
@@ -101,6 +115,9 @@ type Deployer interface {
101115
UpgradeCluster(ctx context.Context, clusterID string, CurrentImages string, NewImage string) error
102116
AddNode(ctx context.Context, clusterID string) (string, error)
103117
RemoveNode(ctx context.Context, clusterID string, nodeID string) error
118+
FailOverNode(ctx context.Context, clusterID string, nodeID string, failOverType FailOverType, allowUnsafe bool) error
119+
SetNodeRecovery(ctx context.Context, clusterID string, nodeID string, recoveryType RecoveryType) error
120+
RebalanceCluster(ctx context.Context, clusterID string, nodesToEject []string) error
104121
RemoveCluster(ctx context.Context, clusterID string) error
105122
RemoveAll(ctx context.Context) error
106123
Cleanup(ctx context.Context) error

deployment/dockerdeploy/deployer.go

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1163,6 +1163,109 @@ func (d *Deployer) UnpauseNode(ctx context.Context, clusterID string, nodeID str
11631163
return nil
11641164
}
11651165

1166+
func (d *Deployer) RebalanceCluster(ctx context.Context, clusterID string, nodeIDsToEject []string) error {
1167+
controller, err := d.getController(ctx, clusterID)
1168+
if err != nil {
1169+
return errors.Wrap(err, "failed to get controller for cluster")
1170+
}
1171+
1172+
var OTPs []string
1173+
for _, nodeID := range nodeIDsToEject {
1174+
otp, err := d.getNodeOTP(ctx, clusterID, nodeID)
1175+
if err != nil {
1176+
return errors.Wrap(err, "failed to get OTP for node")
1177+
}
1178+
OTPs = append(OTPs, otp)
1179+
}
1180+
1181+
return controller.Rebalance(ctx, OTPs)
1182+
}
1183+
1184+
func (d *Deployer) FailOverNode(ctx context.Context, clusterID string, nodeID string, failOverType deployment.FailOverType, allowUnsafe bool) error {
1185+
node, err := d.getNode(ctx, clusterID, nodeID)
1186+
if err != nil {
1187+
return errors.Wrap(err, "failed to get node")
1188+
}
1189+
controller, err := d.getController(ctx, clusterID)
1190+
if err != nil {
1191+
return errors.Wrap(err, "failed to get controller for cluster")
1192+
}
1193+
1194+
otp, err := d.getNodeOTP(ctx, clusterID, node.NodeID)
1195+
if err != nil {
1196+
return errors.Wrap(err, "failed to get OTP for node")
1197+
}
1198+
1199+
if failOverType == deployment.HardFailOver {
1200+
opts := &clustercontrol.HardFailOverOptions{
1201+
NodeOTPs: []string{otp},
1202+
AllowUnsafe: allowUnsafe,
1203+
}
1204+
err := controller.Controller().HardFailOver(ctx, opts)
1205+
if err != nil {
1206+
return errors.Wrap(err, "hard failover failed")
1207+
}
1208+
} else if failOverType == deployment.GracefulFailOver {
1209+
err := controller.Controller().GracefulFailOver(ctx, []string{otp})
1210+
if err != nil {
1211+
return errors.Wrap(err, "graceful failover start failed")
1212+
}
1213+
1214+
d.logger.Info("waiting for rebalance completion started by graceful failover")
1215+
1216+
err = controller.WaitForNoRunningTasks(ctx)
1217+
if err != nil {
1218+
return errors.Wrap(err, "failed to wait for tasks to complete")
1219+
}
1220+
}
1221+
return nil
1222+
}
1223+
1224+
func (d *Deployer) SetNodeRecovery(ctx context.Context, clusterID string, nodeID string, recoveryType deployment.RecoveryType) error {
1225+
node, err := d.getNode(ctx, clusterID, nodeID)
1226+
if err != nil {
1227+
return errors.Wrap(err, "failed to get node")
1228+
}
1229+
controller, err := d.getController(ctx, clusterID)
1230+
if err != nil {
1231+
return errors.Wrap(err, "failed to get cluster info")
1232+
}
1233+
1234+
otp, err := d.getNodeOTP(ctx, clusterID, node.NodeID)
1235+
if err != nil {
1236+
return errors.Wrap(err, "failed to get OTP for node")
1237+
}
1238+
1239+
opts := &clustercontrol.FailOverRecoveryType{
1240+
NodeOTPs: []string{otp},
1241+
RecoveryType: string(recoveryType),
1242+
}
1243+
1244+
err = controller.Controller().SetRecovery(ctx, opts)
1245+
if err != nil {
1246+
return errors.Wrap(err, "set recovery failed")
1247+
}
1248+
return nil
1249+
}
1250+
1251+
func (d *Deployer) getNodeOTP(ctx context.Context, clusterID string, nodeId string) (string, error) {
1252+
clusterInfo, err := d.getCluster(ctx, clusterID)
1253+
if err != nil {
1254+
return "", errors.Wrap(err, "failed to get cluster info")
1255+
}
1256+
clusterInfoEx, err := d.getClusterInfoEx(ctx, clusterInfo)
1257+
if err != nil {
1258+
return "", errors.Wrap(err, "failed to get extended cluster info")
1259+
}
1260+
1261+
for _, nodeEx := range clusterInfoEx.NodesEx {
1262+
if nodeId == nodeEx.NodeID {
1263+
return nodeEx.OTPNode, nil
1264+
}
1265+
}
1266+
return "", nil
1267+
}
1268+
11661269
// RedeployCluster is not supported by the docker deployer; it always returns
// an error.
func (d *Deployer) RedeployCluster(ctx context.Context, clusterID string) error {
11671270
return errors.New("docker deploy does not support redeploy cluster")
11681271
}

deployment/localdeploy/deployer.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,3 +260,14 @@ func (d *Deployer) UpgradeCluster(ctx context.Context, clusterID string, Current
260260
// EnableDataApi is not supported by localdeploy; it always returns an error.
func (d *Deployer) EnableDataApi(ctx context.Context, clusterID string) error {
261261
return errors.New("localdeploy does not support enabling data api")
262262
}
263+
264+
// FailOverNode is not supported by localdeploy; it always returns an error.
func (d *Deployer) FailOverNode(ctx context.Context, clusterID string, nodeID string, failOverType deployment.FailOverType, allowUnsafe bool) error {
265+
return errors.New("localdeploy does not support failing over a node")
266+
}
267+
// SetNodeRecovery is not supported by localdeploy; it always returns an error.
func (d *Deployer) SetNodeRecovery(ctx context.Context, clusterID string, nodeID string, recoverType deployment.RecoveryType) error {
268+
return errors.New("localdeploy does not support failover recovery")
269+
}
270+
271+
// RebalanceCluster is not supported by localdeploy; it always returns an error.
func (d *Deployer) RebalanceCluster(ctx context.Context, clusterID string, nodesToEject []string) error {
272+
return errors.New("localdeploy does not support rebalance cluster")
273+
}

utils/clustercontrol/controller.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"io"
99
"net/http"
1010
"net/url"
11+
"strconv"
1112
"strings"
1213
"time"
1314

@@ -837,3 +838,37 @@ type DeleteTrustedCAOptions struct {
837838
// DeleteTrustedCA calls c.doDelete on the path
// /pools/default/trustedCAs/<opts.ID> and returns its error.
func (c *Controller) DeleteTrustedCA(ctx context.Context, opts *DeleteTrustedCAOptions) error {
838839
return c.doDelete(ctx, fmt.Sprintf("/pools/default/trustedCAs/%d", opts.ID), nil)
839840
}
841+
842+
// HardFailOverOptions carries the parameters of Controller.HardFailOver.
type HardFailOverOptions struct {
843+
// NodeOTPs lists the OTP node names to fail over; they are sent
// comma-joined as the otpNode form value.
NodeOTPs []string
844+
// AllowUnsafe, when true, adds allowUnsafe=true to the request.
AllowUnsafe bool
845+
}
846+
847+
func (c *Controller) HardFailOver(ctx context.Context, opts *HardFailOverOptions) error {
848+
form := make(url.Values)
849+
form.Add("otpNode", strings.Join(opts.NodeOTPs, ","))
850+
if opts.AllowUnsafe != false { //false is default
851+
form.Add("allowUnsafe", strconv.FormatBool(opts.AllowUnsafe))
852+
}
853+
854+
return c.doFormPost(ctx, "/controller/failOver", form, true, nil)
855+
}
856+
857+
func (c *Controller) GracefulFailOver(ctx context.Context, nodeOTPs []string) error {
858+
form := make(url.Values)
859+
form.Add("otpNode", strings.Join(nodeOTPs, ","))
860+
return c.doFormPost(ctx, "/controller/startGracefulFailover", form, true, nil)
861+
}
862+
863+
// FailOverRecoveryType carries the parameters of Controller.SetRecovery.
type FailOverRecoveryType struct {
864+
// NodeOTPs lists the OTP node names; they are sent comma-joined as the
// otpNode form value.
NodeOTPs []string
865+
// RecoveryType is sent verbatim as the recoveryType form value.
RecoveryType string
866+
}
867+
868+
func (c *Controller) SetRecovery(ctx context.Context, opts *FailOverRecoveryType) error {
869+
form := make(url.Values)
870+
form.Add("otpNode", strings.Join(opts.NodeOTPs, ","))
871+
form.Add("recoveryType", opts.RecoveryType)
872+
873+
return c.doFormPost(ctx, "/controller/setRecoveryType", form, true, nil)
874+
}

0 commit comments

Comments
 (0)