Commit 2f3beb0

Author: taylanisikdemir
[active-active] Fix failover version increment logic (cadence-workflow#7246)
**What changed?**

Active-active domain failovers (changing the region-to-cluster map) increment the failover version of only the updated entries; the rest of the entries are untouched. This causes a problem when an ongoing workflow is attempted to be started again after failover.

Example scenario:

- Domain's initial state:
  ```
  ActiveClustersByRegion: [
    phx: {staging_phx 2}
    dca: {staging_dca 0}
  ]
  ```
- A workflow with id `cron.phx` is running with version 2. It's active on the staging_phx cluster.
- The domain is failed over from PHX to DCA by an operator/automation. The map in the DB now looks like this:
  ```
  ActiveClustersByRegion: [
    phx: {staging_dca 10} # version is incremented from 2 to 10.
    dca: {staging_dca 0}
  ]
  ```
- A `StartWorkflow(cron.phx)` request is made by a client.
- The call arrives at the PHX frontend.
- It checks the domain's `ActiveClustersByRegion` and decides to forward to the DCA frontend.
- The DCA frontend receives the request and makes the corresponding request to DCA history.
- The history engine responsible for the `cron.phx` shard processes the request.
- It gets a "workflow already started" error and compares the new mutable state version with the existing one in the DB.
- The existing mutable state in the DB (which was replicated from the PHX cluster) has version 2.
- The new mutable state in memory has version 0, which is the version DCA uses.
- The new version is less than the previous version, so it returns a "domain not active" error ([ref](https://github.com/cadence-workflow/cadence/blob/147489a7e507a04eade6594854234396daebcd8f/service/history/engine/engineimpl/start_workflow_execution.go#L253)).

The problem is caused by multiple entries in the `ActiveClustersByRegion` map pointing to the same cluster but having different versions. One way to prevent getting into this state is to increment the failover versions of all entries pointing to the same cluster when updating `ActiveClustersByRegion`.

Before:
```
ActiveClustersByRegion: [
  phx: {staging_phx 2}
  dca: {staging_dca 0}
]
```

After:
```
ActiveClustersByRegion: [
  phx: {staging_dca 10} # incremented from 2 to 10 so it points to the dca cluster
  dca: {staging_dca 10} # incremented from 0 to 10 so it's greater than or equal to the entry that was updated (above)
]
```

**Validation Change**

Another change in this PR is to restrict what kinds of updates are allowed for the `ActiveClustersByRegion` map. It doesn't make sense to support multiple hops, and we should prevent cycles. For example, the map below contains multiple hops and will not be allowed:
```
ActiveClustersByRegion: [
  phx: {staging_dca}
  dca: {staging_klm}
  klm: {staging_klm}
]
```

**How did you test it?**
- unit tests
- new simulation
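To make the failing comparison concrete, here is a minimal standalone Go sketch of the version check described above. It is illustrative only: the names and types are made up for the example and are not the actual code in `start_workflow_execution.go`.

```go
package main

import "fmt"

// checkStartVersion sketches the "workflow already started" reconciliation step:
// if the failover version of the new start attempt is lower than the version
// recorded in the existing (replicated) mutable state, the request is rejected
// with a domain-not-active style error.
func checkStartVersion(existingVersion, incomingVersion int64) error {
	if incomingVersion < existingVersion {
		return fmt.Errorf("domain not active: incoming version %d < existing version %d", incomingVersion, existingVersion)
	}
	return nil
}

func main() {
	// Before the fix: the dca entry still carries version 0 while the mutable
	// state replicated from staging_phx has version 2, so the start is rejected.
	fmt.Println(checkStartVersion(2, 0))

	// After the fix: both entries pointing to staging_dca carry version 10,
	// so the retried start passes the version check.
	fmt.Println(checkStartVersion(2, 10))
}
```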
1 parent d6c7517 commit 2f3beb0

9 files changed: +252 / -6 lines changed

.github/workflows/replication-simulation.yml

Lines changed: 1 addition & 0 deletions

```diff
@@ -16,6 +16,7 @@ jobs:
           - activeactive
           - activeactive_cron
           - activeactive_regional_failover
+          - activeactive_regional_failover_start_same_wfid
           - activepassive_to_activeactive
           - clusterredirection
           - default
```

common/domain/attrValidator.go

Lines changed: 45 additions & 0 deletions

```diff
@@ -22,6 +22,7 @@ package domain
 
 import (
 	"fmt"
+	"strings"
 
 	"github.com/uber/cadence/common/cluster"
 	"github.com/uber/cadence/common/persistence"
@@ -117,6 +118,7 @@ func (d *AttrValidatorImpl) validateDomainReplicationConfigForGlobalDomain(
 	}
 
 	if replicationConfig.IsActiveActive() {
+		// validate cluster names and check whether they exist
 		for _, cluster := range activeClusters.ActiveClustersByRegion {
 			if err := d.validateClusterName(cluster.ActiveClusterName); err != nil {
 				return err
@@ -126,6 +128,12 @@ func (d *AttrValidatorImpl) validateDomainReplicationConfigForGlobalDomain(
 				return errActiveClusterNotInClusters
 			}
 		}
+
+		// check region mappings are valid
+		err := d.checkActiveClusterRegionMappings(activeClusters)
+		if err != nil {
+			return err
+		}
 	} else {
 		if err := d.validateClusterName(activeCluster); err != nil {
 			return err
@@ -177,3 +185,40 @@ func (d *AttrValidatorImpl) validateClusterName(
 	}
 	return nil
 }
+
+// checkActiveClusterRegionMappings validates:
+// 1. There's no cycle in region dependencies.
+//    e.g. Following not allowed: region0 maps to a cluster in region1, and region1 maps to a cluster in region0.
+// 2. There's at most one hop in the region dependency chain.
+//    e.g. Following not allowed: region0 maps to a cluster in region1, and region1 maps to a cluster in region2
+func (d *AttrValidatorImpl) checkActiveClusterRegionMappings(activeClusters *types.ActiveClusters) error {
+	inbounds := make(map[string][]string)
+	outbounds := make(map[string]string)
+	allClusters := d.clusterMetadata.GetAllClusterInfo()
+	for fromRegion, cluster := range activeClusters.ActiveClustersByRegion {
+		clusterInfo, ok := allClusters[cluster.ActiveClusterName]
+		if !ok {
+			return &types.BadRequestError{Message: fmt.Sprintf("Cluster %v not found", cluster.ActiveClusterName)}
+		}
+
+		toRegion := clusterInfo.Region
+		if fromRegion == toRegion {
+			continue
+		}
+
+		inbounds[toRegion] = append(inbounds[toRegion], fromRegion)
+		outbounds[fromRegion] = toRegion
+	}
+
+	// The entries that point to a cluster in the same region is omitted in inbounds and outbounds
+	// So if a region X is in inbounds it means a cluster in X region is used by other region(s).
+	// Region X must not be in outbounds. (allow at most one hop rule)
+	// Validating this also ensures that there's no cycle in region dependencies.
+	for toRegion := range inbounds {
+		if _, ok := outbounds[toRegion]; ok {
+			return &types.BadRequestError{Message: "Region " + toRegion + " cannot map to a cluster in another region because it is used as target region by other regions: " + strings.Join(inbounds[toRegion], ", ")}
+		}
+	}
+
+	return nil
+}
```
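As a quick walk-through of the rule above, the sketch below derives the inbound/outbound region maps for the multi-hop example from the PR description (phx -> staging_dca, dca -> staging_klm, klm -> staging_klm). It is a standalone illustration with the cluster-to-region lookup hard-coded, not the validator's actual cluster metadata.

```go
package main

import "fmt"

func main() {
	// Hard-coded region of each cluster; the validator reads this from cluster metadata.
	clusterRegion := map[string]string{
		"staging_phx": "phx",
		"staging_dca": "dca",
		"staging_klm": "klm",
	}
	// The multi-hop map from the PR description: phx -> dca -> klm.
	activeClustersByRegion := map[string]string{
		"phx": "staging_dca",
		"dca": "staging_klm",
		"klm": "staging_klm",
	}

	inbounds := make(map[string][]string) // target region -> regions pointing at it
	outbounds := make(map[string]string)  // source region -> target region
	for fromRegion, clusterName := range activeClustersByRegion {
		toRegion := clusterRegion[clusterName]
		if fromRegion == toRegion {
			continue // entries that stay in their own region are ignored
		}
		inbounds[toRegion] = append(inbounds[toRegion], fromRegion)
		outbounds[fromRegion] = toRegion
	}

	// dca is both a failover target (phx points at it) and a source (it points at klm),
	// so the at-most-one-hop rule rejects this map. A cycle would be caught the same way.
	for toRegion, sources := range inbounds {
		if _, ok := outbounds[toRegion]; ok {
			fmt.Printf("rejected: region %s maps to another region but is already a target of %v\n", toRegion, sources)
		}
	}
}
```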

common/domain/attrValidator_test.go

Lines changed: 90 additions & 0 deletions

```diff
@@ -23,9 +23,13 @@ package domain
 import (
 	"testing"
 
+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/suite"
 
 	"github.com/uber/cadence/common/cluster"
+	"github.com/uber/cadence/common/config"
+	"github.com/uber/cadence/common/log"
+	"github.com/uber/cadence/common/metrics"
 	"github.com/uber/cadence/common/persistence"
 	"github.com/uber/cadence/common/types"
 )
@@ -284,3 +288,89 @@ func (s *attrValidatorSuite) TestValidateDomainReplicationConfigClustersDoesNotR
 	)
 	s.IsType(&types.BadRequestError{}, err)
 }
+
+func TestCheckActiveClusterRegionMappings(t *testing.T) {
+	clusterMetadata := cluster.NewMetadata(
+		config.ClusterGroupMetadata{
+			ClusterGroup: map[string]config.ClusterInformation{
+				"A1": {
+					Region: "A",
+				},
+				"A2": {
+					Region: "A",
+				},
+				"B1": {
+					Region: "B",
+				},
+				"B2": {
+					Region: "B",
+				},
+				"C1": {
+					Region: "C",
+				},
+				"C2": {
+					Region: "C",
+				},
+			},
+		},
+		func(d string) bool { return false },
+		metrics.NewNoopMetricsClient(),
+		log.NewNoop(),
+	)
+
+	tests := []struct {
+		desc           string
+		activeClusters *types.ActiveClusters
+		wantErr        bool
+	}{
+		{
+			desc: "non-existing cluster",
+			activeClusters: &types.ActiveClusters{ActiveClustersByRegion: map[string]types.ActiveClusterInfo{
+				"D": {ActiveClusterName: "D1"},
+			}},
+			wantErr: true,
+		},
+		{
+			desc: "no cycle. every region is mapped to a local cluster",
+			activeClusters: &types.ActiveClusters{ActiveClustersByRegion: map[string]types.ActiveClusterInfo{
+				"A": {ActiveClusterName: "A1"},
+				"B": {ActiveClusterName: "B1"},
+				"C": {ActiveClusterName: "C1"},
+			}},
+			wantErr: false,
+		},
+		{
+			desc: "no cycle. A and C failed over to B",
+			activeClusters: &types.ActiveClusters{ActiveClustersByRegion: map[string]types.ActiveClusterInfo{
+				"A": {ActiveClusterName: "B1"},
+				"B": {ActiveClusterName: "B1"},
+				"C": {ActiveClusterName: "B1"},
+			}},
+			wantErr: false,
+		},
+		{
+			desc: "cycle. A -> B -> C -> A",
+			activeClusters: &types.ActiveClusters{ActiveClustersByRegion: map[string]types.ActiveClusterInfo{
+				"A": {ActiveClusterName: "B2"},
+				"B": {ActiveClusterName: "C2"},
+				"C": {ActiveClusterName: "A2"},
+			}},
+			wantErr: true,
+		},
+		{
+			desc: "no cycle but more than one hop. A -> B -> C",
+			activeClusters: &types.ActiveClusters{ActiveClustersByRegion: map[string]types.ActiveClusterInfo{
+				"A": {ActiveClusterName: "B2"},
+				"B": {ActiveClusterName: "C2"},
+				"C": {ActiveClusterName: "C1"},
+			}},
+			wantErr: true,
+		},
+	}
+	for _, tc := range tests {
+		validator := newAttrValidator(clusterMetadata, int32(1))
+		err := validator.checkActiveClusterRegionMappings(tc.activeClusters)
+		assert.Equal(t, tc.wantErr, err != nil)
+	}
+
+}
```

common/domain/handler.go

Lines changed: 20 additions & 0 deletions

```diff
@@ -1384,6 +1384,10 @@ func (d *handlerImpl) updateReplicationConfig(
 			d.logger.Debugf("Setting activeCluster for region %v to %v. no update case, just copy the existing active cluster", region, activeCluster)
 		}
 	}
+
+	// adjust failover versions so that same cluster in different regions have same failover versions
+	d.adjustFailoverVersions(finalActiveClusters)
+
 	config.ActiveClusters = &types.ActiveClusters{
 		ActiveClustersByRegion: finalActiveClusters,
 	}
@@ -1394,6 +1398,22 @@ func (d *handlerImpl) updateReplicationConfig(
 	return config, clusterUpdated, activeClusterUpdated, nil
 }
 
+func (d *handlerImpl) adjustFailoverVersions(activeClusters map[string]types.ActiveClusterInfo) {
+	clusterToRegions := make(map[string][]string)
+	clusterMaxFailoverVersion := make(map[string]int64)
+	for region, activeCluster := range activeClusters {
+		clusterToRegions[activeCluster.ActiveClusterName] = append(clusterToRegions[activeCluster.ActiveClusterName], region)
+		clusterMaxFailoverVersion[activeCluster.ActiveClusterName] = max(clusterMaxFailoverVersion[activeCluster.ActiveClusterName], activeCluster.FailoverVersion)
+	}
+	for cluster, regions := range clusterToRegions {
+		for _, region := range regions {
+			activeCluster := activeClusters[region]
+			activeCluster.FailoverVersion = clusterMaxFailoverVersion[cluster]
+			activeClusters[region] = activeCluster
+		}
+	}
+}
+
 func (d *handlerImpl) handleGracefulFailover(
 	updateRequest *types.UpdateDomainRequest,
 	replicationConfig *persistence.DomainReplicationConfig,
```
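For a concrete picture of what `adjustFailoverVersions` does to the map from the PR description, here is a small standalone sketch that applies the same raise-to-the-cluster-maximum logic. `ActiveClusterInfo` here is a simplified stand-in, not the cadence type.

```go
package main

import "fmt"

// ActiveClusterInfo is a simplified stand-in for the per-region entry.
type ActiveClusterInfo struct {
	ActiveClusterName string
	FailoverVersion   int64
}

// adjustFailoverVersions raises every entry that points to the same cluster
// to that cluster's maximum failover version, mirroring the handler change above.
func adjustFailoverVersions(byRegion map[string]ActiveClusterInfo) {
	maxVersion := make(map[string]int64)
	for _, info := range byRegion {
		if info.FailoverVersion > maxVersion[info.ActiveClusterName] {
			maxVersion[info.ActiveClusterName] = info.FailoverVersion
		}
	}
	for region, info := range byRegion {
		info.FailoverVersion = maxVersion[info.ActiveClusterName]
		byRegion[region] = info
	}
}

func main() {
	// State right after the PHX -> DCA failover from the description:
	// phx was bumped to 10, dca still carries 0.
	byRegion := map[string]ActiveClusterInfo{
		"phx": {ActiveClusterName: "staging_dca", FailoverVersion: 10},
		"dca": {ActiveClusterName: "staging_dca", FailoverVersion: 0},
	}
	adjustFailoverVersions(byRegion)
	fmt.Println(byRegion) // both staging_dca entries now carry failover version 10
}
```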

common/domain/handler_test.go

Lines changed: 5 additions & 4 deletions

```diff
@@ -1722,7 +1722,8 @@ func TestHandler_UpdateDomain(t *testing.T) {
 				ActiveClustersByRegion: map[string]types.ActiveClusterInfo{
 					cluster.TestRegion1: {
 						ActiveClusterName: cluster.TestCurrentClusterName,
-						FailoverVersion:   cluster.TestCurrentClusterInitialFailoverVersion,
+						// This is incremented to match below.
+						FailoverVersion: cluster.TestCurrentClusterInitialFailoverVersion + cluster.TestFailoverVersionIncrement,
 					},
 					cluster.TestRegion2: {
 						ActiveClusterName: cluster.TestCurrentClusterName,
@@ -1786,7 +1787,7 @@ func TestHandler_UpdateDomain(t *testing.T) {
 				ActiveClustersByRegion: map[string]types.ActiveClusterInfo{
 					cluster.TestRegion1: {
 						ActiveClusterName: cluster.TestCurrentClusterName,
-						FailoverVersion:   cluster.TestCurrentClusterInitialFailoverVersion,
+						FailoverVersion:   cluster.TestCurrentClusterInitialFailoverVersion + cluster.TestFailoverVersionIncrement,
 					},
 					cluster.TestRegion2: {
 						ActiveClusterName: cluster.TestCurrentClusterName,
@@ -1822,7 +1823,7 @@ func TestHandler_UpdateDomain(t *testing.T) {
 				ActiveClustersByRegion: map[string]types.ActiveClusterInfo{
 					cluster.TestRegion1: {
 						ActiveClusterName: cluster.TestCurrentClusterName,
-						FailoverVersion:   cluster.TestCurrentClusterInitialFailoverVersion,
+						FailoverVersion:   cluster.TestCurrentClusterInitialFailoverVersion + cluster.TestFailoverVersionIncrement,
 					},
 					cluster.TestRegion2: {
 						ActiveClusterName: cluster.TestCurrentClusterName,
@@ -2885,7 +2886,7 @@ func TestUpdateReplicationConfig(t *testing.T) {
 				ActiveClustersByRegion: map[string]types.ActiveClusterInfo{
 					cluster.TestRegion1: {
 						ActiveClusterName: cluster.TestCurrentClusterName,
-						FailoverVersion:   cluster.TestCurrentClusterInitialFailoverVersion,
+						FailoverVersion:   cluster.TestCurrentClusterInitialFailoverVersion + cluster.TestFailoverVersionIncrement,
 					},
 					cluster.TestRegion2: {
 						ActiveClusterName: cluster.TestCurrentClusterName,
```
config/dynamicconfig/replication_simulation_activeactive_regional_failover_start_same_wfid.yml

Lines changed: 22 additions & 0 deletions

```diff
@@ -0,0 +1,22 @@
+# This file is used as dynamicconfig override for "activeactive_regional_failover_start_same_wfid" replication simulation scenario configured via simulation/replication/testdata/replication_simulation_activeactive_regional_failover_start_same_wfid.yaml
+system.writeVisibilityStoreName:
+  - value: "db"
+system.readVisibilityStoreName:
+  - value: "db"
+history.replicatorTaskBatchSize:
+  - value: 25
+    constraints: {}
+frontend.failoverCoolDown:
+  - value: 5s
+history.ReplicationTaskProcessorStartWait: # default is 5s. repl task processor sleeps this much before processing received messages.
+  - value: 10ms
+history.standbyTaskMissingEventsResendDelay:
+  - value: 5s
+history.standbyTaskMissingEventsDiscardDelay:
+  - value: 10s
+history.standbyClusterDelay:
+  - value: 10s
+history.enableTransferQueueV2:
+  - value: true
+history.enableTimerQueueV2:
+  - value: true
```

simulation/replication/replication_simulation_test.go

Lines changed: 7 additions & 0 deletions

```diff
@@ -153,6 +153,13 @@ func startWorkflow(
 	})
 
 	if err != nil {
+		if op.Want.Error != "" {
+			if strings.Contains(err.Error(), op.Want.Error) {
+				simTypes.Logf(t, "Start workflow got expected error: %s on domain: %s on cluster: %s. Error: %s", op.WorkflowID, op.Domain, op.Cluster, err.Error())
+				return nil
+			}
+			return fmt.Errorf("expected error: %s, but got: %s", op.Want.Error, err.Error())
+		}
 		return err
 	}
 
```

simulation/replication/testdata/replication_simulation_activeactive_regional_failover.yaml

Lines changed: 2 additions & 2 deletions

```diff
@@ -45,9 +45,9 @@ operations:
       region0: cluster1 # this is changed from cluster0 to cluster1
       region1: cluster1
 
-  # Start wf2 on cluster0 at the same time as failover. It will be started by cluster0 workers and completed by cluster1 workers.
+  # Start wf2 on cluster0 right before failover. It will be started by cluster0 workers and completed by cluster1 workers.
   - op: start_workflow
-    at: 10s
+    at: 9s
     workflowID: wf2
     workflowType: timer-activity-loop-workflow
     cluster: cluster0
```
simulation/replication/testdata/replication_simulation_activeactive_regional_failover_start_same_wfid.yaml

Lines changed: 60 additions & 0 deletions

```diff
@@ -0,0 +1,60 @@
+# This file is a replication simulation scenario spec.
+# It is parsed into ReplicationSimulationConfig struct.
+# Replication simulation for this file can be run via ./simulation/replication/run.sh activeactive_regional_failover_start_same_wfid
+# Dynamic config overrides can be set via config/dynamicconfig/replication_simulation_activeactive_regional_failover_start_same_wfid.yml
+clusters:
+  cluster0:
+    grpcEndpoint: "cadence-cluster0:7833"
+  cluster1:
+    grpcEndpoint: "cadence-cluster1:7833"
+
+# primaryCluster is where domain data is written to and replicates to others. e.g. domain registration
+primaryCluster: "cluster0"
+
+domains:
+  test-domain-aa:
+    activeClustersByRegion:
+      region0: cluster0
+      region1: cluster1
+
+operations:
+  # Start wf1 on cluster1 before failover.
+  - op: start_workflow
+    at: 0s
+    workflowID: wf1
+    workflowType: timer-activity-loop-workflow
+    cluster: cluster1
+    domain: test-domain-aa
+    workflowExecutionStartToCloseTimeout: 70s
+    workflowDuration: 60s
+
+
+  # Failover from cluster1 to cluster0
+  - op: change_active_clusters
+    at: 10s
+    domain: test-domain-aa
+    newActiveClustersByRegion:
+      region1: cluster0
+
+  # Attempt to start wf1 on cluster1 again. It will be forwarded to cluster0 and cluster0 should reject with error "Workflow execution is already running".
+  - op: start_workflow
+    at: 30s
+    workflowID: wf1
+    workflowType: timer-activity-loop-workflow
+    cluster: cluster1
+    domain: test-domain-aa
+    workflowExecutionStartToCloseTimeout: 70s
+    workflowDuration: 60s
+    want:
+      error: "Workflow execution is already running"
+
+  # Validate that wf1 is completed in cluster0.
+  - op: validate
+    at: 80s
+    workflowID: wf1
+    cluster: cluster1
+    domain: test-domain-aa
+    want:
+      status: completed
+      startedByWorkersInCluster: cluster1
+      completedByWorkersInCluster: cluster0
```
