Skip to content

Commit f4c962f

Browse files
authored
slemicro manual upgrade and node replacement support (rancher#231)
* slemicro upgrades * add wait for ssh ready * fix lint * review comments * update func name * move scp after reboot * mkdir /opt/data for path provisioner fix * k3s skip enable fix * lint fix * lint * lint * moving mkdir /opt/data to test * fix lint * fix lint * fix reboot timings * lint * reduce wait time * add pod describe and logs * typo fix * linter * lint * lint * testing delay changes * add vars to cluster * adding namespace for pod * change error msg * filename fix * try * adding debug log lines * check if increasing timeout helps * adding create /opt/data for agent nodes * revert time increase * re adding extra time for timeout * update master server * added return * try reboot in lps for slemicro * revert reboot * adding more logging * fix logs and remove create /var/../storage dir * fix lint * review comment fixes * review comment fixes * review fix * lint fix * reordered the nodes a bit * reverting resource name in cluster object * fix review comments * review fixes * comment fixes * split the slemicro into separate func * lint fix * lint fix
1 parent f371788 commit f4c962f

10 files changed

Lines changed: 532 additions & 116 deletions

File tree

pkg/aws/ec2.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,28 @@ func (c Client) StartInstance(instanceID string) error {
213213
return nil
214214
}
215215

216+
func (c Client) RebootInstance(instanceID string) error {
217+
if instanceID == "" {
218+
return shared.ReturnLogError("calling RebootInstance with empty instance ID, must send a valid instance ID")
219+
}
220+
221+
input := &ec2.RebootInstancesInput{
222+
InstanceIds: []*string{aws.String(instanceID)},
223+
}
224+
225+
_, err := c.ec2.RebootInstances(input)
226+
if err != nil {
227+
return shared.ReturnLogError("failed to reboot instance %s: %v", instanceID, err)
228+
}
229+
230+
startErr := c.waitForInstanceRunning(instanceID)
231+
if startErr != nil {
232+
return shared.ReturnLogError("timed out on reboot instance %s: %v", instanceID, startErr)
233+
}
234+
235+
return nil
236+
}
237+
216238
func (c Client) ReleaseElasticIps(ipAddress string) error {
217239
if ipAddress == "" {
218240
return shared.ReturnLogError("calling ReleaseElasticIps with empty ip address, must send a valid ip address")

pkg/testcase/clusterrestore.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ func newInstance(awsClient *aws.Client) (newServerName, newExternalIP string) {
111111
func installProduct(cluster *shared.Cluster, newClusterIP, version string) {
112112
setConfigFile(cluster, newClusterIP)
113113

114-
installCmd := shared.GetInstallCmd(cluster.Config.Product, version, "server")
114+
installCmd := shared.GetInstallCmd(cluster, version, "server")
115115
if cluster.Config.Product == "k3s" {
116116
skipInstall := fmt.Sprintf(" INSTALL_%s_SKIP_ENABLE=true ", strings.ToUpper(cluster.Config.Product))
117117
installCmd = strings.Replace(installCmd, "sh", skipInstall+" "+" sh", 1)

pkg/testcase/localpathstorage.go

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,11 @@ import (
99
. "github.com/onsi/gomega"
1010
)
1111

12-
var lps = "local-path-storage"
12+
var namespace = "local-path-storage"
1313

1414
func TestLocalPathProvisionerStorage(cluster *shared.Cluster, applyWorkload, deleteWorkload bool) {
15+
createDir(cluster)
16+
1517
var workloadErr error
1618
if applyWorkload {
1719
workloadErr = shared.ManageWorkload("apply", "local-path-provisioner.yaml")
@@ -24,22 +26,25 @@ func TestLocalPathProvisionerStorage(cluster *shared.Cluster, applyWorkload, del
2426
getPodVolumeTestRunning,
2527
statusRunning,
2628
)
29+
if err != nil {
30+
logDebugData(cluster)
31+
}
2732
Expect(err).NotTo(HaveOccurred(), err)
2833

29-
_, err = shared.WriteDataPod(cluster, lps)
34+
_, err = shared.WriteDataPod(cluster, namespace)
3035
Expect(err).NotTo(HaveOccurred(), "error writing data to pod: %v", err)
3136

3237
Eventually(func(g Gomega) {
3338
var res string
3439
shared.LogLevel("info", "Reading data from pod")
3540

36-
res, err = shared.ReadDataPod(cluster, lps)
41+
res, err = shared.ReadDataPod(cluster, namespace)
3742
g.Expect(err).NotTo(HaveOccurred())
3843
g.Expect(res).Should(ContainSubstring("testing local path"))
3944
g.Expect(err).NotTo(HaveOccurred())
4045
}, "300s", "5s").Should(Succeed())
4146

42-
_, err = shared.ReadDataPod(cluster, lps)
47+
_, err = shared.ReadDataPod(cluster, namespace)
4348
if err != nil {
4449
return
4550
}
@@ -66,10 +71,55 @@ func readData(cluster *shared.Cluster) error {
6671
delay := time.After(30 * time.Second)
6772
<-delay
6873

69-
_, err = shared.ReadDataPod(cluster, lps)
74+
_, err = shared.ReadDataPod(cluster, namespace)
7075
if err != nil {
7176
return err
7277
}
7378

7479
return nil
7580
}
81+
82+
func createDir(cluster *shared.Cluster) {
83+
shared.LogLevel("debug", "node OS: %s ", cluster.NodeOS)
84+
if cluster.NodeOS == "slemicro" {
85+
for _, ip := range append(cluster.ServerIPs, cluster.AgentIPs...) {
86+
shared.CreateDir("/opt/data", "+w", ip)
87+
}
88+
}
89+
}
90+
91+
// Logs the following debug data:
92+
// 1. pod log and describe pod output for 'helper-pod-create-pvc' pod.
93+
// 2. pod log and describe pod output for all pods in local-path-storage namespace
94+
// 3. kubectl get pv,pvc,storageclass output
95+
// 4. sestatus output
96+
// 5. grep audit logs for denied calls and log the same.
97+
func logDebugData(cluster *shared.Cluster) {
98+
// Pod log and describe pod output for 'helper-pod-create-pvc' pod
99+
shared.FindPodAndLog("helper-pod-create-pvc", "kube-system")
100+
101+
// Pod Log and describe pod output with namespace: local-path-storage
102+
shared.LogAllPodsForNamespace(namespace)
103+
104+
// Log the kubectl get pv,pvc,storageclass
105+
output, getErr := shared.KubectlCommand(cluster, "node", "get", "pv,pvc,storageclass", "-A")
106+
if getErr != nil {
107+
shared.LogLevel("error", "error getting pv,pvc and storageclass info")
108+
}
109+
if output != "" {
110+
shared.LogLevel("debug", "pv,pvc,storageclass info:\n %s", output)
111+
}
112+
113+
// Log sestatus output
114+
cmd := "sestatus"
115+
seStatusOut, statusLogErr := shared.RunCommandOnNode(cmd, cluster.ServerIPs[0])
116+
if statusLogErr != nil {
117+
shared.LogLevel("error", "error getting sestatus output")
118+
}
119+
if seStatusOut != "" {
120+
shared.LogLevel("debug", "sestatus:\n %s", seStatusOut)
121+
}
122+
123+
// Grep and Log the audit logs for denied messages
124+
shared.LogGrepOutput("/var/log/audit/audit.log", "denied", cluster.ServerIPs[0])
125+
}

pkg/testcase/upgrademanual.go

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"errors"
55
"fmt"
66

7+
"github.com/rancher/distros-test-framework/pkg/aws"
78
"github.com/rancher/distros-test-framework/pkg/k8s"
89
"github.com/rancher/distros-test-framework/shared"
910

@@ -14,6 +15,8 @@ const (
1415
server = "server"
1516
status = "status"
1617
restart = "restart"
18+
stop = "stop"
19+
start = "start"
1720
)
1821

1922
// TestUpgradeClusterManual upgrades the cluster "manually".
@@ -29,10 +32,14 @@ func TestUpgradeClusterManual(cluster *shared.Cluster, k8sClient *k8s.Client, ve
2932
return shared.ReturnLogError("no nodes found to upgrade")
3033
}
3134

35+
// Initialize aws client in case reboot is needed for slemicro
36+
shared.LogLevel("debug", "Testing Node OS: %s", cluster.NodeOS)
37+
awsClient := getAwsClient(cluster)
38+
3239
// Upgrades server nodes sequentially
3340
if cluster.NumServers > 0 {
3441
for _, ip := range cluster.ServerIPs {
35-
if err := upgradeProduct(cluster.Config.Product, server, version, ip); err != nil {
42+
if err := upgradeProduct(awsClient, cluster, server, version, ip); err != nil {
3643
shared.LogLevel("error", "error upgrading %s %s: %v", server, ip, err)
3744
return err
3845
}
@@ -42,7 +49,7 @@ func TestUpgradeClusterManual(cluster *shared.Cluster, k8sClient *k8s.Client, ve
4249
// Upgrades agent nodes sequentially
4350
if cluster.NumAgents > 0 {
4451
for _, ip := range cluster.AgentIPs {
45-
if err := upgradeProduct(cluster.Config.Product, agent, version, ip); err != nil {
52+
if err := upgradeProduct(awsClient, cluster, agent, version, ip); err != nil {
4653
shared.LogLevel("error", "error upgrading %s %s: %v", agent, ip, err)
4754
return err
4855
}
@@ -60,20 +67,37 @@ func TestUpgradeClusterManual(cluster *shared.Cluster, k8sClient *k8s.Client, ve
6067
return nil
6168
}
6269

63-
// upgradeProduct upgrades a node server or agent type to the specified version.
64-
func upgradeProduct(product, nodeType, installType, ip string) error {
65-
upgradeCommand := shared.GetInstallCmd(product, installType, nodeType)
70+
// nodeType can be server or agent.
71+
// installType can be version or commit.
72+
func runUpgradeCommand(cluster *shared.Cluster, nodeType, installType, ip string) error {
73+
upgradeCommand := shared.GetInstallCmd(cluster, installType, nodeType)
6674
shared.LogLevel("info", "Upgrading %s %s: %s", ip, nodeType, upgradeCommand)
6775
if _, err := shared.RunCommandOnNode(upgradeCommand, ip); err != nil {
6876
shared.LogLevel("error", "error running cmd on %s %s: %v", nodeType, ip, err)
6977
return err
7078
}
7179

72-
actions := []shared.ServiceAction{
73-
{Service: product, Action: restart, NodeType: nodeType, ExplicitDelay: 180},
74-
{Service: product, Action: status, NodeType: nodeType, ExplicitDelay: 30},
80+
return nil
81+
}
82+
83+
// upgradeProduct upgrades a node server or agent type to the specified version.
84+
func upgradeProduct(awsClient *aws.Client, cluster *shared.Cluster, nodeType, installType, ip string) error {
85+
nodeOS := cluster.NodeOS
86+
product := cluster.Config.Product
87+
88+
err := runUpgradeCommand(cluster, nodeType, installType, ip)
89+
if err != nil {
90+
return err
91+
}
92+
93+
if nodeOS == "slemicro" {
94+
rebootNodeAndWait(awsClient, ip)
7595
}
7696

97+
actions := []shared.ServiceAction{
98+
{Service: product, Action: restart, NodeType: nodeType, ExplicitDelay: 60},
99+
{Service: product, Action: status, NodeType: nodeType, ExplicitDelay: 120},
100+
}
77101
if product == "rke2" {
78102
ms := shared.NewManageService(3, 30)
79103
output, err := ms.ManageService(ip, actions)
@@ -88,7 +112,21 @@ func upgradeProduct(product, nodeType, installType, ip string) error {
88112

89113
if product == "k3s" {
90114
ms := shared.NewManageService(3, 10)
91-
output, err := ms.ManageService(ip, []shared.ServiceAction{actions[1]})
115+
var output string
116+
var err error
117+
if nodeOS == "slemicro" {
118+
sleActions := []shared.ServiceAction{
119+
{Service: product, Action: stop, NodeType: nodeType, ExplicitDelay: 30},
120+
{Service: product, Action: start, NodeType: nodeType, ExplicitDelay: 30},
121+
{Service: product, Action: status, NodeType: nodeType, ExplicitDelay: 120},
122+
}
123+
output, err = ms.ManageService(ip, sleActions)
124+
} else {
125+
k3sActions := []shared.ServiceAction{
126+
{Service: product, Action: status, NodeType: nodeType, ExplicitDelay: 30},
127+
}
128+
output, err = ms.ManageService(ip, k3sActions)
129+
}
92130
if output != "" {
93131
Expect(output).To(ContainSubstring("active "),
94132
fmt.Sprintf("error running %s service %s on %s node: %s", product, status, nodeType, ip))

0 commit comments

Comments (0)