Skip to content

Commit 7157fce

Browse files
authored
Merge pull request #1312 from zimnx/mz/replace-node-uuid
Replace ScyllaCluster nodes using Host ID
2 parents fac3da6 + 98f603f commit 7157fce

File tree

374 files changed

+51564
-189
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

374 files changed

+51564
-189
lines changed

examples/common/operator.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2965,7 +2965,7 @@ spec:
29652965
replace_address_first_boot:
29662966
additionalProperties:
29672967
type: string
2968-
description: replace_address_first_boot holds addresses which should be replaced by new nodes.
2968+
description: 'replace_address_first_boot holds addresses which should be replaced by new nodes. DEPRECATED: since Scylla Operator 1.10 it''s only used for deprecated replace node procedure (ScyllaDB OS <5.2, Enterprise <2023.1). With Scylla Operator 1.11+ this field may be empty.'
29692969
type: object
29702970
stale:
29712971
description: stale indicates if the current rack status is collected for a previous generation. stale should eventually become false when the appropriate controller writes a fresh status.

pkg/api/scylla/v1/scylla.scylladb.com_scyllaclusters.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2009,7 +2009,7 @@ spec:
20092009
replace_address_first_boot:
20102010
additionalProperties:
20112011
type: string
2012-
description: replace_address_first_boot holds addresses which should be replaced by new nodes.
2012+
description: 'replace_address_first_boot holds addresses which should be replaced by new nodes. DEPRECATED: since Scylla Operator 1.10 it''s only used for deprecated replace node procedure (ScyllaDB OS <5.2, Enterprise <2023.1). With Scylla Operator 1.11+ this field may be empty.'
20132013
type: object
20142014
stale:
20152015
description: stale indicates if the current rack status is collected for a previous generation. stale should eventually become false when the appropriate controller writes a fresh status.

pkg/api/scylla/v1/types_cluster.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -493,10 +493,9 @@ type RackStatus struct {
493493
// conditions are the latest available observations of a rack's state.
494494
Conditions []RackCondition `json:"conditions,omitempty"`
495495

496-
// FIXME: The json value should have been a camelCase string.
497-
// We need to deprecate this value and introduce a new one.
498-
499496
// replace_address_first_boot holds addresses which should be replaced by new nodes.
497+
// DEPRECATED: since Scylla Operator 1.10 it's only used for deprecated replace node procedure (ScyllaDB OS <5.2, Enterprise <2023.1).
498+
// With Scylla Operator 1.11+ this field may be empty.
500499
ReplaceAddressFirstBoot map[string]string `json:"replace_address_first_boot,omitempty"`
501500
}
502501

pkg/cmd/tests/tests_run.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,15 +49,15 @@ var suites = ginkgotest.TestSuites{
4949
Description: templates.LongDesc(`
5050
Runs all tests.
5151
`),
52-
DefaultParallelism: 15,
52+
DefaultParallelism: 8,
5353
},
5454
{
5555
Name: "scylla-operator/conformance/parallel",
5656
Description: templates.LongDesc(`
5757
Tests that ensure an Scylla Operator is working properly.
5858
`),
5959
LabelFilter: fmt.Sprintf("!%s", framework.SerialLabelName),
60-
DefaultParallelism: 15,
60+
DefaultParallelism: 8,
6161
},
6262
{
6363
Name: "scylla-operator/conformance/serial",

pkg/controller/scyllacluster/sync_services.go

Lines changed: 392 additions & 174 deletions
Large diffs are not rendered by default.

pkg/naming/constants.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ const (
1919
// ReplaceLabel expresses the intent to replace the pod under the specific member.
2020
ReplaceLabel = "scylla/replace"
2121

22+
// ReplacingNodeHostIDLabel contains the Host ID of the node that the labelled node is replacing.
23+
ReplacingNodeHostIDLabel = "internal.scylla-operator.scylladb.com/replacing-node-hostid"
24+
2225
// NodeMaintenanceLabel means that node is under maintenance.
2326
// Readiness check will always fail when this label is added to member service.
2427
NodeMaintenanceLabel = "scylla/node-maintenance"

pkg/scyllaclient/config_client.go

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
// Copyright (c) 2023 ScyllaDB.
2+
3+
package scyllaclient
4+
5+
import (
6+
"context"
7+
"fmt"
8+
"net"
9+
"net/http"
10+
"time"
11+
12+
api "github.com/go-openapi/runtime/client"
13+
"github.com/go-openapi/strfmt"
14+
"github.com/scylladb/scylla-operator/pkg/auth"
15+
scyllaclient "github.com/scylladb/scylladb-swagger-go-client/scylladb/gen/v2/client"
16+
"github.com/scylladb/scylladb-swagger-go-client/scylladb/gen/v2/client/config"
17+
)
18+
19+
// Connection defaults used by NewConfigClient.
const (
20+
// agentPort is the port joined with the host passed to NewConfigClient.
// NOTE(review): presumably the Scylla Manager Agent port — confirm.
agentPort = "10001"
21+
// defaultTimeout is the timeout set on the http.Client built by NewConfigClient.
defaultTimeout = 30 * time.Second
22+
)
23+
24+
// ConfigClient reads ScyllaDB configuration parameters through the generated
// ScyllaDB v2 API client.
type ConfigClient struct {
25+
// client is the generated v2 API client that all methods delegate to.
client *scyllaclient.ScylladbV2
26+
}
27+
28+
// NewConfigClient returns a ConfigClient that talks to the given host on agentPort.
// Requests go through DefaultTransport wrapped with fixContentType and with
// auth.AddToken using authToken, and are bounded by defaultTimeout.
func NewConfigClient(host, authToken string) *ConfigClient {
29+
var transport http.RoundTripper = DefaultTransport()
30+
// NOTE(review): fixContentType is defined elsewhere; presumably it corrects
// the response Content-Type header — confirm.
transport = fixContentType(transport)
31+
transport = auth.AddToken(transport, authToken)
32+
33+
client := &http.Client{
34+
Timeout: defaultTimeout,
35+
Transport: transport,
36+
}
37+
38+
// JoinHostPort brackets IPv6 literals, so host may be either an IPv4/IPv6 address or a name.
host = net.JoinHostPort(host, agentPort)
39+
40+
scyllaV2Runtime := api.NewWithClient(
41+
host, scyllaclient.DefaultBasePath, scyllaclient.DefaultSchemes, client,
42+
)
43+
44+
return &ConfigClient{
45+
client: scyllaclient.New(scyllaV2Runtime, strfmt.Default),
46+
}
47+
}
48+
49+
// BroadcastAddress returns value of "broadcast_address" config parameter.
// On failure it returns an empty string and the wrapped API error.
func (c *ConfigClient) BroadcastAddress(ctx context.Context) (string, error) {
50+
resp, err := c.client.Config.FindConfigBroadcastAddress(config.NewFindConfigBroadcastAddressParamsWithContext(ctx))
51+
if err != nil {
52+
return "", fmt.Errorf("can't get broadcast_address: %w", err)
53+
}
54+
return resp.Payload, nil
55+
}
56+
57+
// ReplaceAddressFirstBoot returns value of "replace_address_first_boot" config parameter.
// On failure it returns an empty string and the wrapped API error.
func (c *ConfigClient) ReplaceAddressFirstBoot(ctx context.Context) (string, error) {
59+
resp, err := c.client.Config.FindConfigReplaceAddressFirstBoot(config.NewFindConfigReplaceAddressFirstBootParamsWithContext(ctx))
60+
if err != nil {
61+
return "", fmt.Errorf("can't get replace_address_first_boot: %w", err)
62+
}
63+
return resp.Payload, nil
64+
}
65+
66+
// ReplaceNodeFirstBoot returns value of "replace_node_first_boot" config parameter.
// On failure it returns an empty string and the wrapped API error.
func (c *ConfigClient) ReplaceNodeFirstBoot(ctx context.Context) (string, error) {
68+
resp, err := c.client.Config.FindConfigReplaceNodeFirstBoot(config.NewFindConfigReplaceNodeFirstBootParamsWithContext(ctx))
69+
if err != nil {
70+
return "", fmt.Errorf("can't get replace_node_first_boot: %w", err)
71+
}
72+
return resp.Payload, nil
73+
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// Copyright (c) 2023 ScyllaDB.
2+
3+
package scyllafeatures
4+
5+
import (
6+
"fmt"
7+
8+
"github.com/blang/semver"
9+
scyllav1 "github.com/scylladb/scylla-operator/pkg/api/scylla/v1"
10+
)
11+
12+
var (
13+
// scyllaEnterpriseMinimalVersion is the threshold used by isEnterprise and
// isOpenSource: versions at or above it are treated as Enterprise, versions
// below it as Open Source.
scyllaEnterpriseMinimalVersion = semver.MustParse("2000.0.0")
14+
)
15+
16+
// ScyllaFeature names a ScyllaDB capability whose availability depends on the
// ScyllaDB version; it is the key type of featureMinimalVersionConstraints.
type ScyllaFeature string
17+
18+
const (
19+
// ReplacingNodeUsingHostID identifies the feature of replacing a node by its Host ID.
// Its minimal versions are listed in featureMinimalVersionConstraints.
ReplacingNodeUsingHostID ScyllaFeature = "ReplacingNodeUsingHostID"
20+
)
21+
22+
// scyllaDBVersionMinimalConstraint holds the lowest version that provides a
// feature, separately for each ScyllaDB flavour.
type scyllaDBVersionMinimalConstraint struct {
23+
// openSource is the minimal version compared when the cluster version is Open Source.
openSource semver.Version
24+
// enterprise is the minimal version compared when the cluster version is Enterprise.
enterprise semver.Version
25+
}
26+
27+
// featureMinimalVersionConstraints maps each known feature to the minimal
// versions that support it. Supports returns an error for features missing here.
var featureMinimalVersionConstraints = map[ScyllaFeature]scyllaDBVersionMinimalConstraint{
28+
ReplacingNodeUsingHostID: {
29+
openSource: semver.MustParse("5.2.0"),
30+
enterprise: semver.MustParse("2023.1.0"),
31+
},
32+
}
33+
34+
// Supports reports whether the ScyllaDB version in sc.Spec.Version provides the given feature.
// It returns an error when the feature has no entry in featureMinimalVersionConstraints
// or when sc.Spec.Version can't be parsed as a semantic version.
func Supports(sc *scyllav1.ScyllaCluster, feature ScyllaFeature) (bool, error) {
35+
constraints, ok := featureMinimalVersionConstraints[feature]
36+
if !ok {
37+
return false, fmt.Errorf("unable to find minimal version constraints, unknown feature %q", feature)
38+
}
39+
40+
version, err := semver.Parse(sc.Spec.Version)
41+
if err != nil {
42+
return false, fmt.Errorf("can't parse ScyllaCluster version %q: %w", sc.Spec.Version, err)
43+
}
44+
45+
// Open Source and Enterprise use separate version lines, so the version is
// compared only against the constraint of its own flavour.
if isOpenSource(version) && version.GTE(constraints.openSource) {
46+
return true, nil
47+
}
48+
49+
if isEnterprise(version) && version.GTE(constraints.enterprise) {
50+
return true, nil
51+
}
52+
53+
return false, nil
54+
}
55+
56+
// isEnterprise reports whether v is at or above scyllaEnterpriseMinimalVersion.
func isEnterprise(v semver.Version) bool {
57+
return v.GTE(scyllaEnterpriseMinimalVersion)
58+
}
59+
60+
// isOpenSource reports whether v is below scyllaEnterpriseMinimalVersion.
func isOpenSource(v semver.Version) bool {
61+
return v.LT(scyllaEnterpriseMinimalVersion)
62+
}

pkg/sidecar/config/config.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,8 +246,20 @@ func (s *ScyllaConfig) setupEntrypoint(ctx context.Context) (*exec.Cmd, error) {
246246
}
247247
// If node is being replaced
248248
if addr, ok := m.ServiceLabels[naming.ReplaceLabel]; ok {
249-
args["replace-address-first-boot"] = pointer.StringPtr(addr)
249+
if len(addr) == 0 {
250+
klog.Warningf("Service %q have unexpectedly empty label %q, skipping replace", m.Name, naming.ReplaceLabel)
251+
} else {
252+
args["replace-address-first-boot"] = pointer.StringPtr(addr)
253+
}
254+
}
255+
if hostID, ok := m.ServiceLabels[naming.ReplacingNodeHostIDLabel]; ok {
256+
if len(hostID) == 0 {
257+
klog.Warningf("Service %q have unexpectedly empty label %q, skipping replace", m.Name, naming.ReplacingNodeHostIDLabel)
258+
} else {
259+
args["replace-node-first-boot"] = pointer.String(hostID)
260+
}
250261
}
262+
251263
// See if we need to use cpu-pinning
252264
// TODO: Add more checks to make sure this is valid.
253265
// eg. parse the cpuset and check the number of cpus is the same as cpu limits

test/e2e/set/scyllacluster/scyllacluster_replace.go

Lines changed: 91 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
g "github.com/onsi/ginkgo/v2"
1111
o "github.com/onsi/gomega"
1212
"github.com/scylladb/scylla-operator/pkg/naming"
13+
"github.com/scylladb/scylla-operator/pkg/scyllaclient"
1314
scyllafixture "github.com/scylladb/scylla-operator/test/e2e/fixture/scylla"
1415
"github.com/scylladb/scylla-operator/test/e2e/framework"
1516
"github.com/scylladb/scylla-operator/test/e2e/utils"
@@ -18,17 +19,61 @@ import (
1819
"k8s.io/apimachinery/pkg/types"
1920
)
2021

21-
var _ = g.Describe("ScyllaCluster replace", func() {
22+
var _ = g.Describe("ScyllaCluster", func() {
2223
defer g.GinkgoRecover()
2324

2425
f := framework.NewFramework("scyllacluster")
2526

26-
g.It("should replace a node", func() {
27+
const (
28+
scyllaOSImageRepository = "docker.io/scylladb/scylla"
29+
scyllaEnterpriseImageRepository = "docker.io/scylladb/scylla-enterprise"
30+
)
31+
32+
validateReplaceViaClusterIPAddress := func(ctx context.Context, configClient *scyllaclient.ConfigClient, preReplaceService *corev1.Service) error {
33+
replaceAddressFirstBoot, err := configClient.ReplaceAddressFirstBoot(ctx)
34+
if err != nil {
35+
return fmt.Errorf("can't get replace_address_first_boot config parameter: %w", err)
36+
}
37+
38+
if replaceAddressFirstBoot != preReplaceService.Spec.ClusterIP {
39+
return fmt.Errorf("unexpected value of replace_address_first_boot scylla config, expected %q, got %q", preReplaceService.Spec.ClusterIP, replaceAddressFirstBoot)
40+
}
41+
42+
return nil
43+
}
44+
45+
validateReplaceViaHostID := func(ctx context.Context, configClient *scyllaclient.ConfigClient, preReplaceService *corev1.Service) error {
46+
replaceNodeFirstBoot, err := configClient.ReplaceNodeFirstBoot(ctx)
47+
if err != nil {
48+
return fmt.Errorf("can't get replace_node_first_boot config parameter: %w", err)
49+
}
50+
51+
if replaceNodeFirstBoot != preReplaceService.Annotations[naming.HostIDAnnotation] {
52+
return fmt.Errorf("unexpected value of replace_node_first_boot scylla config, expected %q, got %q", preReplaceService.Annotations[naming.HostIDAnnotation], replaceNodeFirstBoot)
53+
}
54+
55+
return nil
56+
}
57+
58+
type entry struct {
59+
procedure string
60+
scyllaImageRepository string
61+
scyllaVersion string
62+
validateScyllaConfig func(context.Context, *scyllaclient.ConfigClient, *corev1.Service) error
63+
}
64+
65+
describeEntry := func(e *entry) string {
66+
return fmt.Sprintf(`using %s based procedure when version of ScyllaDB is "%s:%s"`, e.procedure, e.scyllaImageRepository, e.scyllaVersion)
67+
}
68+
69+
g.DescribeTable("should replace a node", func(e *entry) {
2770
ctx, cancel := context.WithTimeout(context.Background(), testTimeout)
2871
defer cancel()
2972

3073
sc := scyllafixture.BasicScyllaCluster.ReadOrFail()
31-
sc.Spec.Datacenter.Racks[0].Members = 2
74+
sc.Spec.Repository = e.scyllaImageRepository
75+
sc.Spec.Version = e.scyllaVersion
76+
sc.Spec.Datacenter.Racks[0].Members = 3
3277

3378
framework.By("Creating a ScyllaCluster")
3479
sc, err := f.ScyllaClient().ScyllaV1().ScyllaClusters(f.Namespace()).Create(ctx, sc, metav1.CreateOptions{})
@@ -42,10 +87,15 @@ var _ = g.Describe("ScyllaCluster replace", func() {
4287

4388
verifyScyllaCluster(ctx, f.KubeClient(), sc)
4489
hosts := getScyllaHostsAndWaitForFullQuorum(ctx, f.KubeClient().CoreV1(), sc)
45-
o.Expect(hosts).To(o.HaveLen(2))
90+
o.Expect(hosts).To(o.HaveLen(int(utils.GetMemberCount(sc))))
4691
di := insertAndVerifyCQLData(ctx, hosts)
4792
defer di.Close()
4893

94+
replacedNodeService, err := f.KubeClient().CoreV1().Services(sc.Namespace).Get(ctx, utils.GetNodeName(sc, 0), metav1.GetOptions{})
95+
o.Expect(err).NotTo(o.HaveOccurred())
96+
97+
preReplaceService := replacedNodeService.DeepCopy()
98+
4999
framework.By("Replacing a node #0")
50100
pod, err := f.KubeClient().CoreV1().Pods(f.Namespace()).Get(
51101
ctx,
@@ -91,7 +141,7 @@ var _ = g.Describe("ScyllaCluster replace", func() {
91141
client, _, err := utils.GetScyllaClient(ctx, f.KubeClient().CoreV1(), sc)
92142
o.Expect(err).NotTo(o.HaveOccurred())
93143

94-
replacedNodeService, err := f.KubeClient().CoreV1().Services(sc.Namespace).Get(ctx, utils.GetNodeName(sc, 0), metav1.GetOptions{})
144+
replacedNodeService, err = f.KubeClient().CoreV1().Services(sc.Namespace).Get(ctx, utils.GetNodeName(sc, 0), metav1.GetOptions{})
95145
o.Expect(err).NotTo(o.HaveOccurred())
96146

97147
otherNodeService, err := f.KubeClient().CoreV1().Services(sc.Namespace).Get(ctx, utils.GetNodeName(sc, 1), metav1.GetOptions{})
@@ -106,9 +156,43 @@ var _ = g.Describe("ScyllaCluster replace", func() {
106156
oldHosts := hosts
107157
hosts = getScyllaHostsAndWaitForFullQuorum(ctx, f.KubeClient().CoreV1(), sc)
108158
o.Expect(hosts).To(o.HaveLen(len(oldHosts)))
109-
o.Expect(hosts).NotTo(o.ConsistOf(oldHosts))
110159
err = di.SetClientEndpoints(hosts)
111160
o.Expect(err).NotTo(o.HaveOccurred())
112161
verifyCQLData(ctx, di)
113-
})
162+
163+
framework.By("Verifying ScyllaDB config")
164+
165+
configClient, err := utils.GetScyllaConfigClient(ctx, f.KubeClient().CoreV1(), sc, replacedNodeService.Spec.ClusterIP)
166+
o.Expect(err).NotTo(o.HaveOccurred())
167+
168+
err = e.validateScyllaConfig(ctx, configClient, preReplaceService)
169+
o.Expect(err).NotTo(o.HaveOccurred())
170+
},
171+
g.Entry(describeEntry, &entry{
172+
procedure: "ClusterIP",
173+
scyllaImageRepository: scyllaOSImageRepository,
174+
scyllaVersion: "5.1.15",
175+
validateScyllaConfig: validateReplaceViaClusterIPAddress,
176+
}),
177+
g.Entry(describeEntry, &entry{
178+
procedure: "ClusterIP",
179+
scyllaImageRepository: scyllaEnterpriseImageRepository,
180+
scyllaVersion: "2022.2.12",
181+
validateScyllaConfig: validateReplaceViaClusterIPAddress,
182+
}),
183+
g.Entry(describeEntry, &entry{
184+
procedure: "HostID",
185+
scyllaImageRepository: scyllaOSImageRepository,
186+
scyllaVersion: "5.2.0",
187+
validateScyllaConfig: validateReplaceViaHostID,
188+
}),
189+
// TODO: Enable test when ScyllaDB Enterprise 2023.1 is released
190+
// Ref: https://github.com/scylladb/scylla-operator/issues/1325
191+
g.PEntry(describeEntry, &entry{
192+
procedure: "HostID",
193+
scyllaImageRepository: scyllaEnterpriseImageRepository,
194+
scyllaVersion: "2023.1.0",
195+
validateScyllaConfig: validateReplaceViaHostID,
196+
}),
197+
)
114198
})

0 commit comments

Comments
 (0)