Issue #398: decreasing replicas will make zookeeper unrecoverable when zookeeper is not running. #406
base: master
Changes from 7 commits
@@ -14,6 +14,7 @@ import (
 	"context"
 	"fmt"
 	"strconv"
+	"strings"
 	"time"
 
 	"github.com/operator-framework/operator-sdk/pkg/predicate"
@@ -279,6 +280,8 @@ func (r *ReconcileZookeeperCluster) reconcileStatefulSet(instance *zookeeperv1be
 	foundSTSSize := *foundSts.Spec.Replicas
 	newSTSSize := *sts.Spec.Replicas
 	if newSTSSize != foundSTSSize {
+		// If zookeeper is not running, we must not update the replica count.
+		// Decreasing Replicas only takes effect once zookeeper is running and the client has connected to it successfully.
 		zkUri := utils.GetZkServiceUri(instance)
 		err = r.zkClient.Connect(zkUri)
 		if err != nil {
@@ -295,7 +298,31 @@ func (r *ReconcileZookeeperCluster) reconcileStatefulSet(instance *zookeeperv1be
 		data := "CLUSTER_SIZE=" + strconv.Itoa(int(newSTSSize))
 		r.log.Info("Updating Cluster Size.", "New Data:", data, "Version", version)
-		r.zkClient.UpdateNode(path, data, version)
+		err = r.zkClient.UpdateNode(path, data, version)
+		if err != nil {
+			return fmt.Errorf("Error updating cluster size %s: %v", path, err)
+		}
+		// #398: when scaling down, remove the departing nodes immediately after the znode update succeeds.
+		if newSTSSize < foundSTSSize {
+			var removes []string
+			config, _, err := r.zkClient.GetConfig()
+			if err != nil {
+				return fmt.Errorf("Error GetConfig %v", err)
+			}
+			r.log.Info("Get zookeeper config.", "Config: ", config)
+			for myid := newSTSSize + 1; myid <= foundSTSSize; myid++ {
+				if strings.Contains(config, "server."+strconv.Itoa(int(myid))+"=") {
+					removes = append(removes, strconv.Itoa(int(myid)))
+				}
+			}
+			// A node that has been removed via reconfig can still serve all clients that are already connected.
+			// So we remove it first; this avoids errors where a client cannot connect to the server during preStop.
+			r.log.Info("Do reconfig to remove node.", "Remove ids", strings.Join(removes, ","))
Review thread on the reconfig line above:

Contributor: The check above that returns an error at line 303 already ensures zookeeper is running.

Author: It does not matter whether the node is also removed in the teardown script; doing it or not will not affect the cluster.

Contributor: @stop-coding Did you see I think @anishakj is suggesting that catching the

Author: @jkhalack Sorry for my late reply. It is known that updating "Spec.Replicas" in k8s tells pods to be created or to exit. If the update fails on scale-down, we can stop updating "Spec.Replicas" until the cluster recovers; that ensures the reconfigure has been done before the pods exit. So I think doing the reconfigure while checking for a cluster scale-down is better.
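The argument in the last reply, sketched as code: talk to ZooKeeper first (connection check, CLUSTER_SIZE znode, reconfig-remove of the departing ids) and only hand the smaller replica count to k8s once that has succeeded, so pods never exit before the ensemble has been reconfigured. The ZKClient interface and scaleDown helper below are stand-ins modelled on the calls visible in this diff, not the operator's actual types; the diff resumes after the sketch.

```go
package main

import "fmt"

// ZKClient is a stand-in for the operator's zookeeper client, reduced to the
// calls visible in this diff; the real interface lives in the operator codebase.
type ZKClient interface {
	Connect(uri string) error
	UpdateNode(path string, data string, version int32) error
	IncReconfig(joining, leaving []string, version int64) error
}

// scaleDown sketches the ordering argued for above: bail out early if the
// ensemble is unreachable, shrink CLUSTER_SIZE, reconfig-remove the departing
// ids, and only then let the caller shrink the StatefulSet.
func scaleDown(zk ZKClient, uri, metaPath string, version int32, newSize int, removes []string, updateSTS func() error) error {
	if err := zk.Connect(uri); err != nil {
		// Leaving Spec.Replicas unapplied here is the whole point: the pods
		// must not exit before the ensemble has been reconfigured.
		return fmt.Errorf("zookeeper not reachable, skipping replica update: %v", err)
	}
	if err := zk.UpdateNode(metaPath, fmt.Sprintf("CLUSTER_SIZE=%d", newSize), version); err != nil {
		return err
	}
	if len(removes) > 0 {
		if err := zk.IncReconfig(nil, removes, -1); err != nil {
			return err
		}
	}
	return updateSTS() // the StatefulSet shrinks only after the reconfig succeeded
}

// fakeZK just records the order of calls so the sketch can be run as-is.
type fakeZK struct{ calls []string }

func (f *fakeZK) Connect(uri string) error { f.calls = append(f.calls, "connect "+uri); return nil }
func (f *fakeZK) UpdateNode(path, data string, version int32) error {
	f.calls = append(f.calls, "update "+path+" "+data)
	return nil
}
func (f *fakeZK) IncReconfig(joining, leaving []string, version int64) error {
	f.calls = append(f.calls, fmt.Sprintf("reconfig remove %v", leaving))
	return nil
}

func main() {
	zk := &fakeZK{}
	_ = scaleDown(zk, "zk-client:2181", "/zookeeper-operator/example", 0, 1, []string{"2", "3"}, func() error {
		zk.calls = append(zk.calls, "update statefulset")
		return nil
	})
	fmt.Println(zk.calls) // connect, update znode, reconfig remove [2 3], update statefulset
}
```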
+			err = r.zkClient.IncReconfig(nil, removes, -1)
+			if err != nil {
+				return fmt.Errorf("Error reconfig remove id:%s, %v", strings.Join(removes, ","), err)
+			}
+		}
 	}
 	err = r.updateStatefulSet(instance, foundSts, sts)
 	if err != nil {
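For context on the removal loop above: the dynamic configuration string returned by GetConfig lists one line per server, roughly of the form server.<myid>=<host>:<quorumPort>:<electionPort>[:role];<clientPort>, plus a version line. A minimal standalone sketch of that selection step; the idsToRemove helper and the hostnames are illustrative, not part of the PR:

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// idsToRemove mirrors the selection loop in the PR: every myid in the range
// (newSize, oldSize] that is still present in the dynamic config is scheduled
// for removal via reconfig. The helper name and signature are illustrative only.
func idsToRemove(config string, newSize, oldSize int32) []string {
	var removes []string
	for myid := newSize + 1; myid <= oldSize; myid++ {
		if strings.Contains(config, "server."+strconv.Itoa(int(myid))+"=") {
			removes = append(removes, strconv.Itoa(int(myid)))
		}
	}
	return removes
}

func main() {
	// Example dynamic config for a 3-node ensemble (hostnames are made up).
	config := strings.Join([]string{
		"server.1=zk-0.zk-headless:2888:3888:participant;2181",
		"server.2=zk-1.zk-headless:2888:3888:participant;2181",
		"server.3=zk-2.zk-headless:2888:3888:participant;2181",
		"version=100000000",
	}, "\n")

	// Scaling from 3 replicas down to 1 should remove myids 2 and 3.
	fmt.Println(idsToRemove(config, 1, 3)) // [2 3]
}
```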
@@ -615,7 +642,8 @@ func (r *ReconcileZookeeperCluster) reconcileClusterStatus(instance *zookeeperv1
 	instance.Status.Members.Ready = readyMembers
 	instance.Status.Members.Unready = unreadyMembers
 
-	//If Cluster is in a ready state...
+	// If Cluster is in a ready state...
+	// instance.Spec.Replicas is only the desired value we set; k8s may not have applied it yet.
 	if instance.Spec.Replicas == instance.Status.ReadyReplicas && (!instance.Status.MetaRootCreated) {
 		r.log.Info("Cluster is Ready, Creating ZK Metadata...")
 		zkUri := utils.GetZkServiceUri(instance)
@@ -635,7 +663,7 @@ func (r *ReconcileZookeeperCluster) reconcileClusterStatus(instance *zookeeperv1
 	r.log.Info("Updating zookeeper status",
 		"StatefulSet.Namespace", instance.Namespace,
 		"StatefulSet.Name", instance.Name)
-	if instance.Status.ReadyReplicas == instance.Spec.Replicas {
+	if instance.Status.ReadyReplicas == instance.Spec.Replicas && instance.Status.Replicas == instance.Spec.Replicas {
 		instance.Status.SetPodsReadyConditionTrue()
 	} else {
 		instance.Status.SetPodsReadyConditionFalse()
@@ -765,7 +793,9 @@ func (r *ReconcileZookeeperCluster) getPVCCount(instance *zookeeperv1beta1.Zooke
 
 func (r *ReconcileZookeeperCluster) cleanupOrphanPVCs(instance *zookeeperv1beta1.ZookeeperCluster) (err error) {
 	// this check should make sure we do not delete the PVCs before the STS has scaled down
-	if instance.Status.ReadyReplicas == instance.Spec.Replicas {
+	// instance.Spec.Replicas is only the desired value we set; k8s may not have applied it if the update failed.
+	// So also check that instance.Status.Replicas equals ReadyReplicas and Spec.Replicas, which means the cluster has settled.
+	if instance.Status.ReadyReplicas == instance.Spec.Replicas && instance.Status.Replicas == instance.Spec.Replicas {
 		pvcCount, err := r.getPVCCount(instance)
 		if err != nil {
 			return err
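The same strengthened guard is used above for the PodsReady condition and here before deleting orphan PVCs: during a failed or still-propagating scale-down, Status.ReadyReplicas can already equal Spec.Replicas while the StatefulSet still owns the old number of pods, and deleting their PVCs at that point is what made the cluster unrecoverable in #398. A small illustrative predicate (the clusterSettled helper is not from the PR):

```go
package main

import "fmt"

// clusterSettled reflects the guard used in the PR: the desired size must match
// both the ready replica count and the total replica count reported in status.
func clusterSettled(specReplicas, statusReplicas, readyReplicas int32) bool {
	return readyReplicas == specReplicas && statusReplicas == specReplicas
}

func main() {
	// Mid scale-down from 3 to 1: one pod is ready, but the StatefulSet still
	// owns 3 pods, so PVC cleanup must wait.
	fmt.Println(clusterSettled(1, 3, 1)) // false

	// Scale-down finished: safe to clean orphan PVCs and set PodsReady.
	fmt.Println(clusterSettled(1, 1, 1)) // true
}
```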