Skip to content

Commit caa2037

Browse files
authored
[4.18 cherry-pick of #9912] [RDR] [Co-situated hub] Hub recovery after site-failure and failover/relocate of DR workloads (#11964)
Signed-off-by: am-agrawa <[email protected]>
1 parent 3479818 commit caa2037

File tree

8 files changed

+498
-6
lines changed

8 files changed

+498
-6
lines changed

ocs_ci/deployment/deployment.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3443,14 +3443,20 @@ def validate_secret_creation_oadp(self):
34433443
except CommandFailed:
34443444
raise ResourceNotFoundError("Secret Not found")
34453445

3446+
@retry(
3447+
exception_to_check=ResourceWrongStatusException, # or a specific one
3448+
tries=8,
3449+
delay=15,
3450+
backoff=2,
3451+
)
34463452
def validate_policy_compliance_status(
34473453
self, resource_name, resource_namespace, compliance_state
34483454
):
34493455
"""
34503456
Validate policy status for given resource
34513457
3452-
Raises:
3453-
ResourceWrongStatusException: Raised when resource state does not match
3458+
Returns: True if compliance check passes else raises ResourceWrongStatusException when resource state
3459+
does not match
34543460
34553461
"""
34563462

@@ -3462,6 +3468,7 @@ def validate_policy_compliance_status(
34623468
compliance_status = compliance_output.get()
34633469
if compliance_status["status"]["compliant"] == compliance_state:
34643470
logger.info("Compliance status Matches ")
3471+
return True
34653472
else:
34663473
raise ResourceWrongStatusException("Compliance status does not match")
34673474

ocs_ci/framework/pytest_customization/marks.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,11 @@
594594
reason="RDR UI failover or relocate config needed",
595595
)
596596

597+
dr_hub_recovery = pytest.mark.skipif(
598+
config.nclusters != 4,
599+
reason="DR hub recovery requires 4th OCP cluster to be available for Passive hub",
600+
)
601+
597602
# Filter warnings
598603
filter_insecure_request_warning = pytest.mark.filterwarnings(
599604
"ignore::urllib3.exceptions.InsecureRequestWarning"

ocs_ci/helpers/dr_helpers.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
get_passive_acm_index,
3030
enable_mco_console_plugin,
3131
set_recovery_as_primary,
32+
get_all_acm_indexes,
3233
)
3334
from ocs_ci.utility import version, templating
3435
from ocs_ci.utility.retry import retry
@@ -43,6 +44,7 @@
4344
run_cmd_verify_cli_output,
4445
find_cephblockpoolradosnamespace,
4546
find_cephfilesystemsubvolumegroup,
47+
create_unique_resource_name,
4648
)
4749

4850
logger = logging.getLogger(__name__)
@@ -1103,7 +1105,9 @@ def get_all_drpolicy():
11031105

11041106

11051107
def verify_last_group_sync_time(
1106-
drpc_obj, scheduling_interval, initial_last_group_sync_time=None
1108+
drpc_obj,
1109+
scheduling_interval,
1110+
initial_last_group_sync_time=None,
11071111
):
11081112
"""
11091113
Verifies that the lastGroupSyncTime for a given DRPC object is within the expected range.
@@ -1401,6 +1405,9 @@ def restore_backup():
14011405
restore_index = config.cur_index
14021406
config.switch_ctx(get_passive_acm_index())
14031407
restore_schedule = templating.load_yaml(constants.DR_RESTORE_YAML)
1408+
restore_schedule["metadata"]["name"] = create_unique_resource_name(
1409+
resource_description="acm", resource_type="restore"
1410+
)
14041411
restore_schedule_yaml = tempfile.NamedTemporaryFile(
14051412
mode="w+", prefix="restore", delete=False
14061413
)
@@ -1547,6 +1554,7 @@ def create_klusterlet_config():
15471554
)
15481555
templating.dump_data_to_temp_yaml(klusterlet_config, klusterlet_config_yaml.name)
15491556
run_cmd(f"oc create -f {klusterlet_config_yaml.name}")
1557+
logger.info("Klusterletconfig is successfully created on the passive hub")
15501558
config.switch_ctx(old_ctx)
15511559

15521560

@@ -1564,6 +1572,9 @@ def remove_parameter_klusterlet_config():
15641572
klusterlet_config_obj.patch(
15651573
resource_name=name, params=json.dumps(remove_op), format_type="json"
15661574
)
1575+
logger.info(
1576+
"appliedManifestWorkEvictionGracePeriod and it's value is successfully removed from KlusterletConfig"
1577+
)
15671578
config.switch_ctx(old_ctx)
15681579

15691580

@@ -1915,3 +1926,50 @@ def verify_last_kubeobject_protection_time(drpc_obj, kubeobject_sync_interval):
19151926
logger.info("Verified lastKubeObjectProtectionTime value within expected range")
19161927
config.switch_ctx(restore_index)
19171928
return last_kubeobject_protection_time
1929+
1930+
1931+
def configure_rdr_hub_recovery():
    """
    RDR helper function to create backup schedule on the active hub cluster needed for hub recovery
    using backup and restore.

    This function ensures all pre-reqs are verified before hub recovery is performed:
    a backup schedule is created on the active hub, then on every ACM hub the backup,
    the DPA and the "backup-restore-enabled" policy compliance are validated. Finally the
    cluster-monitoring label is added to the openshift-operators namespace on the passive hub.

    Note: the function returns with the context switched to the passive ACM hub; callers
    that need another context must switch explicitly.

    Returns:
        bool: True once all pre-reqs are verified

    Raises:
        AssertionError: If the policy compliance validation returns a falsy result.
            Any exception raised by the underlying validation helpers is propagated as is.

    """
    # Imported at function scope to avoid circular import; done once rather than
    # on every loop iteration.
    from ocs_ci.deployment.deployment import (
        Deployment,
        get_multicluster_dr_deployment,
    )

    # Create backup-schedule on active hub
    config.switch_acm_ctx()
    logger.info("Create backup schedule on the active hub cluster")
    create_backup_schedule()

    # Give the schedule enough time to produce a first backup before verifying it.
    # NOTE(review): presumably sized against the backup schedule interval - confirm
    # if the schedule template changes.
    wait_time = 420
    logger.info(f"Wait {wait_time} seconds until backup is taken ")
    time.sleep(wait_time)

    logger.info(
        "Check pre-reqs on both the hub clusters before performing hub recovery"
    )
    # NOTE(review): all validations below are assumed to run once per hub
    # (active and passive), matching the log message above.
    for acm_index in get_all_acm_indexes():
        config.switch_ctx(acm_index)
        verify_backup_is_taken()

        dr_conf = Deployment().get_rdr_conf()
        rdr_deployment = get_multicluster_dr_deployment()(dr_conf)
        rdr_deployment.validate_dpa()

        # Run the validation outside of the assert expression so that it is
        # still executed when Python runs with assertions disabled (-O).
        policy_compliant = rdr_deployment.validate_policy_compliance_status(
            resource_name="backup-restore-enabled",
            resource_namespace="open-cluster-management-backup",
            compliance_state="Compliant",
        )
        assert policy_compliant

    config.switch_ctx(get_passive_acm_index())
    logger.info(
        "Add label for cluster-monitoring needed to fire VolumeSyncronizationDelay alert on the Hub cluster"
    )
    exec_cmd(
        "oc label namespace openshift-operators openshift.io/cluster-monitoring='true'"
    )
    logger.info("All pre-reqs verified for performing hub recovery")
    return True

ocs_ci/ocs/dr/dr_workload.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ def __init__(self, **kwargs):
7272

7373
self.workload_type = kwargs.get("workload_type", constants.SUBSCRIPTION)
7474
self.workload_namespace = kwargs.get("workload_namespace", None)
75+
self.pvc_interface = kwargs.get("pvc_interface", None)
7576
self.app_name = kwargs.get("app_name", None)
7677
self.workload_pod_count = kwargs.get("workload_pod_count")
7778
self.workload_pvc_count = kwargs.get("workload_pvc_count")
@@ -466,6 +467,7 @@ def __init__(self, **kwargs):
466467

467468
self.workload_type = kwargs.get("workload_type", constants.APPLICATION_SET)
468469
self.workload_namespace = kwargs.get("workload_namespace", None)
470+
self.pvc_interface = kwargs.get("pvc_interface", None)
469471
self.workload_pod_count = kwargs.get("workload_pod_count")
470472
self.workload_pvc_count = kwargs.get("workload_pvc_count")
471473
self.dr_policy_name = kwargs.get(

ocs_ci/templates/ocs-deployment/multicluster/backupschedule.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ metadata:
44
name: schedule-acm
55
namespace: open-cluster-management-backup
66
spec:
7-
veleroSchedule: 0 */1 * * *
8-
veleroTtl: 48h
7+
veleroSchedule: "*/5 * * * *"
8+
veleroTtl: 96h
99
useManagedServiceAccount: true
1010
managedServiceAccountTTL: 96h

ocs_ci/templates/ocs-deployment/multicluster/klusterlet_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ kind: KlusterletConfig
33
metadata:
44
name: global
55
spec:
6-
appliedManifestWorkEvictionGracePeriod: "24h"
6+
appliedManifestWorkEvictionGracePeriod: "168h"

tests/conftest.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6886,6 +6886,7 @@ def factory(
68866886
workload_dir=workload_details["workload_dir"],
68876887
workload_pod_count=workload_details["pod_count"],
68886888
workload_pvc_count=workload_details["pvc_count"],
6889+
pvc_interface=pvc_interface,
68896890
)
68906891
instances.append(workload)
68916892
total_pvc_count += workload_details["pvc_count"]
@@ -6905,6 +6906,7 @@ def factory(
69056906
],
69066907
workload_pvc_selector=workload_details["dr_workload_app_pvc_selector"],
69076908
appset_model=appset_model,
6909+
pvc_interface=pvc_interface,
69086910
)
69096911
instances.append(workload)
69106912
total_pvc_count += workload_details["pvc_count"]

0 commit comments

Comments
 (0)