Commit 572bca1

[release-4.18] [RDR] Neutral ACM hub failure and recovery (#12039)
Signed-off-by: am-agrawa <[email protected]>
Co-authored-by: am-agrawa <[email protected]>
1 parent caa2037 commit 572bca1

Lines changed: 370 additions & 0 deletions
@@ -0,0 +1,370 @@
import logging

import time

from concurrent.futures import ThreadPoolExecutor

import pytest

from ocs_ci.framework.pytest_customization.marks import (
    turquoise_squad,
    dr_hub_recovery,
    tier2,
)
from ocs_ci.framework import config
from ocs_ci.helpers import dr_helpers
from ocs_ci.ocs.acm.acm import (
    validate_cluster_import,
)
from ocs_ci.ocs import constants
from ocs_ci.ocs.node import get_node_objs, wait_for_nodes_status
from ocs_ci.helpers.dr_helpers import (
    failover,
    restore_backup,
    get_current_primary_cluster_name,
    get_current_secondary_cluster_name,
    get_passive_acm_index,
    wait_for_all_resources_creation,
    verify_drpolicy_cli,
    verify_restore_is_completed,
    wait_for_all_resources_deletion,
    relocate,
    get_scheduling_interval,
    create_klusterlet_config,
    remove_parameter_klusterlet_config,
    configure_rdr_hub_recovery,
)
from ocs_ci.ocs.exceptions import UnexpectedBehaviour
from ocs_ci.ocs.resources.drpc import DRPC
from ocs_ci.ocs.resources.pod import wait_for_pods_to_be_running
from ocs_ci.ocs.utils import get_active_acm_index

from ocs_ci.utility.utils import TimeoutSampler, run_cmd

logger = logging.getLogger(__name__)


@tier2
@turquoise_squad
@dr_hub_recovery
@pytest.mark.order("last")
class TestNeutralHubFailureAndRecovery:
    """
    Perform hub failure where the active hub is at a neutral site, then perform hub recovery
    by moving to the passive hub using backup and restore. Test failover by bringing the primary
    managed cluster down, recover it, and then perform a relocate operation.
    """

    def test_neutral_hub_recovery_and_dr(self, dr_workload, nodes_multicluster):
        """
        Test to verify failover and relocate of all workloads after switching to the passive hub
        post hub recovery
        """

        # Deploy Subscription and AppSet based applications using both RBD and CephFS SC
        rdr_workload = dr_workload(
            num_of_subscription=1,
            num_of_appset=1,
            pvc_interface=constants.CEPHBLOCKPOOL,
            switch_ctx=get_passive_acm_index(),
        )
        dr_workload(
            num_of_subscription=1,
            num_of_appset=1,
            pvc_interface=constants.CEPHFILESYSTEM,
            switch_ctx=get_passive_acm_index(),
        )
        drpc_objs = []
        for wl in rdr_workload:
            if wl.workload_type == constants.SUBSCRIPTION:
                drpc_objs.append(DRPC(namespace=wl.workload_namespace))
            else:
                drpc_objs.append(
                    DRPC(
                        namespace=constants.GITOPS_CLUSTER_NAMESPACE,
                        resource_name=f"{wl.appset_placement_name}-drpc",
                    )
                )

        primary_cluster_name = get_current_primary_cluster_name(
            rdr_workload[0].workload_namespace
        )
        secondary_cluster_name = get_current_secondary_cluster_name(
            rdr_workload[0].workload_namespace
        )

        # Verify the creation of ReplicationDestination resources on the secondary cluster in case of CephFS
        config.switch_to_cluster_by_name(secondary_cluster_name)
        for wl in rdr_workload:
            if wl.pvc_interface == constants.CEPHFILESYSTEM:
                dr_helpers.wait_for_replication_destinations_creation(
                    wl.workload_pvc_count, wl.workload_namespace
                )

        scheduling_interval = get_scheduling_interval(
            rdr_workload[0].workload_namespace, rdr_workload[0].workload_type
        )

        two_times_scheduling_interval = 2 * scheduling_interval  # Time in minutes
        wait_time = 360

        assert configure_rdr_hub_recovery()

        # Get the active hub cluster nodes
        logger.info("Getting Active Hub cluster node details")
        config.switch_ctx(get_active_acm_index())
        active_hub_index = config.cur_index
        active_hub_cluster_node_objs = get_node_objs()

        drpc_cmd = run_cmd("oc get drpc -o wide -A")
        logger.info(
            "DRPC output from current active hub cluster before shutting it down"
        )
        logger.info(drpc_cmd)

        # Shut down active hub cluster nodes
        logger.info("Shutting down all nodes of active hub cluster")
        nodes_multicluster[active_hub_index].stop_nodes(active_hub_cluster_node_objs)
        logger.info(
            "All nodes of active hub cluster are powered off, "
            "wait 480 seconds before restoring backups on the passive hub"
        )
        time.sleep(480)

        config.switch_ctx(get_passive_acm_index())
        # Create KlusterletConfig
        logger.info("Create klusterletconfig on passive hub")
        create_klusterlet_config()

        # Restore new hub
        logger.info("Restore backups on the passive hub cluster")
        restore_backup()
        logger.info(f"Wait {wait_time} seconds until restores are taken")
        time.sleep(wait_time)

        # Verify the restore is completed
        logger.info("Verify if backup restore is successful or not")
        verify_restore_is_completed()

        # Validate if the managed clusters are successfully imported on the new hub
        for cluster in [primary_cluster_name, secondary_cluster_name]:
            for sample in TimeoutSampler(
                timeout=1800,
                sleep=15,
                func=validate_cluster_import,
                cluster_name=cluster,
                switch_ctx=get_passive_acm_index(),
            ):
                if sample:
                    logger.info(
                        f"Cluster: {cluster} successfully imported post hub recovery"
                    )
                    # Validate klusterlet addons are running on managed cluster
                    config.switch_to_cluster_by_name(cluster)
                    wait_for_pods_to_be_running(
                        namespace=constants.ACM_ADDONS_NAMESPACE, timeout=300, sleep=15
                    )
                    break
                else:
                    logger.error(
                        f"Import of cluster: {cluster} failed post hub recovery"
                    )
                    raise UnexpectedBehaviour(
                        f"Import of cluster: {cluster} failed post hub recovery"
                    )
        # Wait for drpolicy to be in validated state
        logger.info("Verify status of DR Policy on the new hub")
        verify_drpolicy_cli(switch_ctx=get_passive_acm_index())

        logger.info(f"Wait for {wait_time} seconds for drpc status to be restored")
        time.sleep(wait_time)

        config.switch_ctx(get_passive_acm_index())
        drpc_cmd = run_cmd("oc get drpc -o wide -A")
        logger.info(
            "DRPC output from new hub cluster before shutting down the primary managed cluster"
        )
        logger.info(drpc_cmd)

        # Get the primary managed cluster nodes
        logger.info("Getting Primary managed cluster node details")
        config.switch_to_cluster_by_name(primary_cluster_name)
        active_primary_index = config.cur_index
        active_primary_cluster_node_objs = get_node_objs()

        # Shut down primary managed cluster nodes
        logger.info("Shutting down all the nodes of primary managed cluster")
        nodes_multicluster[active_primary_index].stop_nodes(
            active_primary_cluster_node_objs
        )
        logger.info("All nodes of primary managed cluster are powered off")
        time.sleep(480)

        # Failover action via CLI
        logger.info(
            "Failover workloads after hub recovery when the primary managed cluster is intentionally shut down"
        )
        failover_results = []
        with ThreadPoolExecutor() as executor:
            for wl in rdr_workload:
                failover_results.append(
                    executor.submit(
                        failover,
                        failover_cluster=secondary_cluster_name,
                        namespace=wl.workload_namespace,
                        workload_type=wl.workload_type,
                        workload_placement_name=(
                            wl.appset_placement_name
                            if wl.workload_type != constants.SUBSCRIPTION
                            else None
                        ),
                        switch_ctx=get_passive_acm_index(),
                    )
                )
                time.sleep(5)

        # Wait for failover results
        for fl in failover_results:
            fl.result()

        # Verify resources creation on secondary cluster (failoverCluster)
        config.switch_to_cluster_by_name(secondary_cluster_name)
        for wl in rdr_workload:
            wait_for_all_resources_creation(
                wl.workload_pvc_count,
                wl.workload_pod_count,
                wl.workload_namespace,
            )

        config.switch_ctx(get_passive_acm_index())
        drpc_cmd = run_cmd("oc get drpc -o wide -A")
        logger.info("DRPC output from new hub cluster after successful failover")
        logger.info(drpc_cmd)

        config.switch_to_cluster_by_name(primary_cluster_name)
        logger.info("Recover the primary managed cluster")
        nodes_multicluster[active_primary_index].start_nodes(
            active_primary_cluster_node_objs
        )
        wait_for_nodes_status([node.name for node in active_primary_cluster_node_objs])

        # Edit the global KlusterletConfig on the new hub and remove
        # the parameter appliedManifestWorkEvictionGracePeriod and its value.
        # appliedManifestWorkEvictionGracePeriod should only be removed if
        # no DRPC is in the Paused PROGRESSION state, or if PROGRESSION is in the
        # Cleaning Up state because workloads were successfully failed over or
        # relocated after hub recovery was performed.
        logger.info(
            "Edit the global KlusterletConfig on the new hub and "
            "remove the parameter appliedManifestWorkEvictionGracePeriod and its value."
        )
        remove_parameter_klusterlet_config()

        logger.info(
            "Wait for approx. an hour to surpass 1hr of default eviction period timeout"
        )
        time.sleep(3600)

        config.switch_to_cluster_by_name(primary_cluster_name)

        # Verify applications are deleted from the old cluster
        for wl in rdr_workload:
            wait_for_all_resources_deletion(wl.workload_namespace, timeout=1800)

        for wl in rdr_workload:
            if wl.pvc_interface == constants.CEPHFILESYSTEM:
                # Verify the deletion of ReplicationDestination resources on secondary cluster
                config.switch_to_cluster_by_name(secondary_cluster_name)
                dr_helpers.wait_for_replication_destinations_deletion(
                    wl.workload_namespace
                )
                # Verify the creation of ReplicationDestination resources on primary cluster
                config.switch_to_cluster_by_name(primary_cluster_name)
                dr_helpers.wait_for_replication_destinations_creation(
                    wl.workload_pvc_count, wl.workload_namespace
                )

        dr_helpers.wait_for_mirroring_status_ok(
            replaying_images=sum(
                [
                    wl.workload_pvc_count
                    for wl in rdr_workload
                    if wl.pvc_interface == constants.CEPHBLOCKPOOL
                ]
            )
        )

        logger.info(f"Waiting for {two_times_scheduling_interval} minutes to run IOs")
        time.sleep(two_times_scheduling_interval * 60)

        config.switch_ctx(get_passive_acm_index())
        logger.info(
            "DRPC output from new hub cluster after successful failover and cleanup"
        )
        drpc_cmd = run_cmd("oc get drpc -o wide -A")
        logger.info(drpc_cmd)

        relocate_results = []
        with ThreadPoolExecutor() as executor:
            for wl in rdr_workload:
                relocate_results.append(
                    executor.submit(
                        relocate,
                        preferred_cluster=primary_cluster_name,
                        namespace=wl.workload_namespace,
                        workload_type=wl.workload_type,
                        workload_placement_name=(
                            wl.appset_placement_name
                            if wl.workload_type != constants.SUBSCRIPTION
                            else None
                        ),
                        switch_ctx=get_passive_acm_index(),
                    )
                )
                time.sleep(5)

        # Wait for relocate results
        for rl in relocate_results:
            rl.result()

        config.switch_ctx(get_passive_acm_index())
        drpc_cmd = run_cmd("oc get drpc -o wide -A")
        logger.info("DRPC output from new hub cluster after relocate")
        logger.info(drpc_cmd)

        # Verify resources creation on preferredCluster
        config.switch_to_cluster_by_name(primary_cluster_name)
        for wl in rdr_workload:
            wait_for_all_resources_creation(
                wl.workload_pvc_count,
                wl.workload_pod_count,
                wl.workload_namespace,
            )

        for wl in rdr_workload:
            if wl.pvc_interface == constants.CEPHFILESYSTEM:
                # Verify the deletion of ReplicationDestination resources on primary cluster
                config.switch_to_cluster_by_name(primary_cluster_name)
                dr_helpers.wait_for_replication_destinations_deletion(
                    wl.workload_namespace
                )
                # Verify the creation of ReplicationDestination resources on secondary cluster
                config.switch_to_cluster_by_name(secondary_cluster_name)
                dr_helpers.wait_for_replication_destinations_creation(
                    wl.workload_pvc_count, wl.workload_namespace
                )

        dr_helpers.wait_for_mirroring_status_ok(
            replaying_images=sum(
                [
                    wl.workload_pvc_count
                    for wl in rdr_workload
                    if wl.pvc_interface == constants.CEPHBLOCKPOOL
                ]
            )
        )

        # Verify resources deletion from the previous primary (current secondary) cluster
        config.switch_to_cluster_by_name(secondary_cluster_name)
        for wl in rdr_workload:
            wait_for_all_resources_deletion(wl.workload_namespace)

        logger.info("Relocate successful")
