Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
7a0a92b
Add cluster status monitor to controller
weyfonk Oct 3, 2024
9e364db
Reflect offline cluster state in more bundle deployment status fields
weyfonk Oct 4, 2024
3599997
Move cluster status monitor to separate package
weyfonk Oct 7, 2024
a6938fe
Set sensible configuration for cluster status monitor
weyfonk Oct 7, 2024
38ee90a
Eliminate linting errors
weyfonk Oct 7, 2024
02937c1
Check for condition status and reason
weyfonk Oct 8, 2024
e1f3998
Prevent agent check-in interval from being 0
weyfonk Oct 14, 2024
1224333
Prevent check-in interval from being 0 when importing cluster
weyfonk Oct 14, 2024
9bc99d8
Make cluster monitor interval and threshold configurable
weyfonk Oct 14, 2024
1ad8b3b
Skip bundle deployment updates for already offline clusters
weyfonk Oct 18, 2024
0e53496
Set Ready condition to Unknown for offline clusters
weyfonk Oct 18, 2024
6fbe68e
Fix json attribute for cluster monitor interval
weyfonk Oct 18, 2024
6c83845
Run cluster status monitor on unsharded controller only
weyfonk Oct 18, 2024
c22a760
Add agent check-in interval to agent install tests
weyfonk Oct 21, 2024
5a2c6a3
Fix linting errors
weyfonk Jan 21, 2026
3784896
Use feature flag for enabling cluster monitor
weyfonk Mar 19, 2026
5299b88
Add threshold to cluster status check log message
weyfonk Mar 19, 2026
4d523c2
Mark Cluster as Offline when its bundle deployments are offline
weyfonk Mar 20, 2026
ba25110
Prevent cluster updates triggered by cluster status updates
weyfonk Mar 20, 2026
56a21af
Use patch operations in cluster monitor
weyfonk Mar 23, 2026
772791c
Validate offline cluster status changes in both directions
weyfonk Mar 24, 2026
de3ad8e
Detect online clusters from online bundle deployments
weyfonk Mar 25, 2026
94a188f
Make linters happy
weyfonk Mar 25, 2026
da57fd6
Disconnect cluster instead of scaling Fleet agent deployment
weyfonk Mar 25, 2026
57ca339
Address Copilot's comments
weyfonk Mar 26, 2026
c419d10
Make linters happy
weyfonk Mar 26, 2026
63a964a
Skip single bundle deployment if already offline instead of whole set
weyfonk May 8, 2026
4a32093
Simplify setting cluster offline and back online from BDs
weyfonk May 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/scripts/deploy-fleet.sh
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ eventually helm upgrade --install fleet charts/fleet \
--set agentImage.tag="$agentTag" \
--set agentImage.imagePullPolicy=IfNotPresent \
--set bootstrap.agentNamespace=cattle-fleet-local-system \
--set agentCheckinInterval=5s \
--set clusterMonitor.enabled=true \
--set clusterMonitor.threshold=20s \
--set clusterMonitor.interval=10s \
--set apiServerCA="$ca" \
--set apiServerURL="$server" \
$shards_settings \
Expand Down
7 changes: 4 additions & 3 deletions .github/workflows/e2e-multicluster-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ jobs:
-p "443:443@agent:0:direct" \
--api-port 6443 \
--agents 1 \
--network "nw01"
--network "fleet"
-
name: Provision k3d Downstream Cluster for agent-initiated registration
run: |
Expand All @@ -91,7 +91,7 @@ jobs:
-p "444:443@agent:0:direct" \
--api-port 6644 \
--agents 1 \
--network "nw01"
--network "fleet"
-
name: Provision k3d Downstream Cluster for manager-initiated registration
run: |
Expand All @@ -100,7 +100,7 @@ jobs:
-p "445:443@agent:0:direct" \
--api-port 6645 \
--agents 1 \
--network "nw01"
--network "fleet"
-
name: Import Images Into k3d
run: |
Expand Down Expand Up @@ -213,6 +213,7 @@ jobs:
env:
FLEET_E2E_NS: fleet-local
FLEET_E2E_NS_DOWNSTREAM: fleet-default
FLEET_E2E_CLUSTER_DOWNSTREAM: k3d-downstream
run: |
# Force use of non-managed downstream cluster for portability
export CI_REGISTERED_CLUSTER=$(kubectl get clusters.fleet.cattle.io -n $FLEET_E2E_NS_DOWNSTREAM -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}' | grep -v second)
Expand Down
5 changes: 5 additions & 0 deletions charts/fleet/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ data:
"bundledeployment": "{{.Values.agent.reconciler.workers.bundledeployment}}",
"drift": "{{.Values.agent.reconciler.workers.drift}}"
},
"clusterMonitor": {
"enabled": {{.Values.clusterMonitor.enabled}},
"interval": "{{.Values.clusterMonitor.interval}}",
"threshold": "{{.Values.clusterMonitor.threshold}}"
},
{{ if .Values.garbageCollectionInterval }}
"garbageCollectionInterval": "{{.Values.garbageCollectionInterval}}",
{{ end }}
Expand Down
11 changes: 11 additions & 0 deletions charts/fleet/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,17 @@ bootstrap:
branch: master
paths: ""

## Cluster monitor for offline cluster detection
clusterMonitor:
enabled: false
# Determines how much time must have elapsed since a downstream cluster's Fleet agent last reported its status to the
# management cluster before that downstream cluster is considered offline.
# If the configured value is shorter than three times the agent check-in interval, then three times the check-in
# interval is used instead, to prevent false positives.
threshold: "45m"
# Determines how often the cluster monitor will check for offline downstream clusters.
interval: "10m"

global:
cattle:
systemDefaultRegistry: ""
Expand Down
4 changes: 4 additions & 0 deletions dev/setup-fleet
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ helm -n cattle-fleet-system upgrade --install --create-namespace --wait --reset-
--set agent.leaderElection.leaseDuration=10s \
--set agent.leaderElection.retryPeriod=1s \
--set agent.leaderElection.renewDeadline=5s \
--set agentCheckinInterval=5s \
--set clusterMonitor.enabled=true \
--set clusterMonitor.threshold=20s \
--set clusterMonitor.interval=10s \
--set garbageCollectionInterval=1s \
--set insecureSkipHostKeyChecks=false \
--set imagescan.enabled=true \
Expand Down
3 changes: 2 additions & 1 deletion e2e/multi-cluster/installation/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,9 @@ var _ = Describe("Fleet installation with TLS agent modes", func() {
"cattle-fleet-system",
"--type=merge",
"-p",
// Agent check-in interval cannot be 0. Any other value will do here.
fmt.Sprintf(
`{"data":{"config":"{\"apiServerURL\": \"https://google.com\", \"apiServerCA\": \"\", \"agentTLSMode\": \"%s\"}"}}`,
`{"data":{"config":"{\"apiServerURL\": \"https://google.com\", \"apiServerCA\": \"\", \"agentTLSMode\": \"%s\", \"agentCheckinInterval\": \"1m\"}"}}`,
agentMode,
),
)
Expand Down
167 changes: 167 additions & 0 deletions e2e/multi-cluster/offline_cluster_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
package multicluster_test

import (
"encoding/json"
"os/exec"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

corev1 "k8s.io/api/core/v1"

"github.com/rancher/fleet/e2e/testenv"
"github.com/rancher/fleet/e2e/testenv/kubectl"
"github.com/rancher/wrangler/v3/pkg/genericcondition"
)

// This test uses two clusters to demonstrate offline cluster handling.
// Offline cluster detection spec.
//
// The spec deploys a HelmOp to the cluster registration namespace, then
// disconnects the downstream k3d server node from the "fleet" Docker network
// and reconnects it, checking the Ready condition of the first listed Cluster
// and of the matching BundleDeployment before, during and after the outage.
var _ = Describe("Offline cluster detection", func() {
	var (
		k     kubectl.Command
		asset string
		name  string
	)

	BeforeEach(func() {
		k = env.Kubectl.Context(env.Upstream)
	})

	Context("cluster with deployed workload becomes offline", func() {
		BeforeEach(func() {
			asset = "multi-cluster/helmop.yaml"
			name = "test-offline-cluster"

			err := testenv.ApplyTemplate(k.Namespace(env.ClusterRegistrationNamespace), testenv.AssetPath(asset), struct {
				Name                  string
				Namespace             string
				Repo                  string
				Chart                 string
				Version               string
				PollingInterval       time.Duration
				HelmSecretName        string
				InsecureSkipTLSVerify bool
			}{
				name,
				env.ClusterRegistrationNamespace,
				"",
				"https://github.com/rancher/fleet/raw/refs/heads/main/integrationtests/cli/assets/helmrepository/config-chart-0.1.0.tgz",
				"",
				0,
				"",
				false,
			})
			Expect(err).ToNot(HaveOccurred())

			DeferCleanup(func() {
				// Best-effort reconnect so that a failure in the middle of the spec
				// does not leave the downstream node cut off for later specs.
				connectCmd := exec.Command("docker", "network", "connect", "fleet", "k3d-"+k3dDownstreamCluster+"-server-0")
				_, _ = connectCmd.CombinedOutput() // will fail if the node is already connected.

				_, _ = k.Namespace(env.ClusterRegistrationNamespace).Delete("helmop", name)
			})
		})

		It("marks any offline cluster as such, along with its bundle deployments", func() {
			// Both label requirements go into a single comma-separated selector:
			// kubectl keeps only the last -l/--selector flag when it is repeated,
			// which would silently drop the bundle-name filter.
			bdSelector := "fleet.cattle.io/bundle-name=" + name +
				",fleet.cattle.io/bundle-namespace=" + env.ClusterRegistrationNamespace

			By("checking the initial online state of the cluster")
			// Cluster should be ready
			Eventually(func(g Gomega) {
				out, err := k.Get(
					"cluster", "-n", env.ClusterRegistrationNamespace,
					"-o", `jsonpath={.items[0].status.conditions}`,
				)
				g.Expect(err).ToNot(HaveOccurred(), out)

				checkReadyCondition(g, out, "", "True")
			}).To(Succeed())

			// Bundle deployment should be ready
			Eventually(func(g Gomega) {
				out, err := k.Get(
					"bundledeployments", "-A",
					"-l", bdSelector,
					"-o", `jsonpath={.items[0].status.conditions}`,
				)
				g.Expect(err).ToNot(HaveOccurred(), out)

				checkReadyCondition(g, out, "", "True")
			}).To(Succeed())

			By("taking the cluster offline")
			disconnectCmd := exec.Command("docker", "network", "disconnect", "fleet", "k3d-"+k3dDownstreamCluster+"-server-0")
			out, err := disconnectCmd.CombinedOutput()
			Expect(err).ToNot(HaveOccurred(), string(out))

			By("checking that the bundle deployment and the cluster appear offline")
			// Cluster should be offline
			Eventually(func(g Gomega) {
				out, err := k.Get(
					"cluster", "-n", env.ClusterRegistrationNamespace,
					"-o", `jsonpath={.items[0].status.conditions}`,
				)
				g.Expect(err).ToNot(HaveOccurred(), out)

				checkReadyCondition(g, out, "cluster is offline", "Unknown")
			}).To(Succeed())

			// Bundle deployment should be offline
			Eventually(func(g Gomega) {
				out, err := k.Get(
					"bundledeployments", "-A",
					"-l", bdSelector,
					"-o", `jsonpath={.items[0].status.conditions}`,
				)
				g.Expect(err).ToNot(HaveOccurred(), out)

				checkReadyCondition(g, out, "cluster is offline", "Unknown")
			}).To(Succeed())

			By("taking the cluster back online")
			connectCmd := exec.Command("docker", "network", "connect", "fleet", "k3d-"+k3dDownstreamCluster+"-server-0")
			out, err = connectCmd.CombinedOutput()
			Expect(err).ToNot(HaveOccurred(), string(out))

			By("checking the new online state of the cluster")
			// Cluster should be ready again
			Eventually(func(g Gomega) {
				out, err := k.Get(
					"cluster", "-n", env.ClusterRegistrationNamespace,
					"-o", `jsonpath={.items[0].status.conditions}`,
				)
				g.Expect(err).ToNot(HaveOccurred(), out)

				checkReadyCondition(g, out, "", "True")
			}).To(Succeed())

			// Bundle deployment should be ready again
			Eventually(func(g Gomega) {
				out, err := k.Get(
					"bundledeployments", "-A",
					"-l", bdSelector,
					"-o", `jsonpath={.items[0].status.conditions}`,
				)
				g.Expect(err).ToNot(HaveOccurred(), out)

				checkReadyCondition(g, out, "", "True")
			}).To(Succeed())
		})
	})
})

func checkReadyCondition(g Gomega, out, msg, status string) {
conds := []genericcondition.GenericCondition{}
err := json.Unmarshal([]byte(out), &conds)
g.Expect(err).ToNot(HaveOccurred())

var readyCond *genericcondition.GenericCondition
for i, c := range conds {
if c.Type == "Ready" {
readyCond = &conds[i]
}
}

g.Expect(readyCond).NotTo(BeNil())
g.Expect(readyCond.Message).To(ContainSubstring(msg))
g.Expect(readyCond.Status).To(Equal(corev1.ConditionStatus(status)))
Comment thread
weyfonk marked this conversation as resolved.
}
10 changes: 8 additions & 2 deletions e2e/multi-cluster/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package multicluster_test

import (
"os"
"strings"
"testing"

"github.com/rancher/fleet/e2e/testenv"
Expand All @@ -17,8 +18,9 @@ func TestE2E(t *testing.T) {
}

var (
env *testenv.Env
dsCluster = "second"
env *testenv.Env
dsCluster = "second" // name of the Fleet Cluster resource
k3dDownstreamCluster = "downstream"
)

var _ = BeforeSuite(func() {
Expand All @@ -30,4 +32,8 @@ var _ = BeforeSuite(func() {
if dsClusterEnvVar := os.Getenv("CI_REGISTERED_CLUSTER"); dsClusterEnvVar != "" {
dsCluster = dsClusterEnvVar
}

if k3dDSClusterEnvVar := os.Getenv("FLEET_E2E_CLUSTER_DOWNSTREAM"); k3dDSClusterEnvVar != "" {
k3dDownstreamCluster = strings.TrimPrefix(k3dDSClusterEnvVar, "k3d-")
}
})
Loading
Loading