+ You can add a replica when creating a new Namespace or at any time on an existing one.
+ Both the Web UI and tcld CLI are supported.
+
+
+
+ {TABS.map((t) => (
+
+ ))}
+
+
+ {/* ── Create with HA ── */}
+ {tab === 'create' && (
+ <>
+
+
New Namespace
+
+ Specify both a primary region and a replica region when creating the Namespace.
+ Temporal Cloud immediately begins replicating all new Workflow history to the replica.
+
+ You can add a replica to an existing Namespace at any time. Temporal Cloud begins
+ asynchronously replicating all ongoing and historical Workflow
+ Executions to the new replica.
+
+
+
+
Via tcld
+ . \\
+ --region `}
+ />
+
+
Via Web UI
+
+
+
1
+
+
+ Go to your Namespace detail page in Temporal Cloud
+
+
+
+
+
2
+
+
+ Click "Add a replica"
+
+
+
+
+
3
+
+
+ Select the desired replica region and confirm
+
+
+ Replication of existing history begins immediately.
+
+
+
+
+
+
+ ℹ️
+
+ Changing replica location: Direct migration of a replica to a
+ different region is not supported. Remove the existing replica first, then add a new
+ one in the desired location. You must wait 7 days before re-enabling
+ HA in the same region after removal.
+
+
+ Removing a replica disables replication and reduces the Namespace back to a standard
+ single-isolation-domain configuration. The 7-day cooldown applies before you can
+ re-add a replica in the same location.
+
+
+
+
Via tcld
+ \\
+ --namespace . \\
+ --region `}
+ />
+
+
Via Web UI
+
+
+
1
+
+
+ Go to your Namespace detail page
+
+
+
+
+
2
+
+
+ Find the replica entry and click "Remove replica"
+
+
+
+
+
3
+
+
Confirm the removal
+
+ The Namespace reverts to standard configuration. SLA reverts to 99.9%.
+
+
+
+
+
+
+ ⚠️
+
+ 7-day cooldown: After removing a replica, you cannot re-enable HA
+ in the same location for 7 days. Plan replica migrations carefully — remove first,
+ add the new location, then remove the old one if needed.
+
+
+ >
+ )}
+
+
Monitoring Replication Health
+
+
+
Cloud UI
+
+ The Temporal Cloud UI displays replica status and health. When a replica becomes
+ unhealthy, failover options are automatically disabled to prevent cascading issues.
+
+
+
+
Metrics
+
+ Replication lag is emitted as pre-computed percentiles (p50, p95, p99) on the metric{' '}
+ temporal_cloud_v0_replication_lag_bucket, labeled with{' '}
+ temporal_namespace. Target: p95 < 1 minute.
+
+
+ Note: temporal_cloud_v0_total_action_count may appear doubled for HA
+ Namespaces — actions are replicated on both primary and replica.
+
+
+
+
+
+
+
+
+ );
+}
diff --git a/src/components/HADemo/FailoverSection.tsx b/src/components/HADemo/FailoverSection.tsx
new file mode 100644
index 0000000000..192b90ec80
--- /dev/null
+++ b/src/components/HADemo/FailoverSection.tsx
@@ -0,0 +1,292 @@
+import React, { useState } from 'react';
+import styles from './HADemo.module.css';
+
+type Props = { onNext: () => void };
+type FailoverMode = 'hybrid' | 'graceful' | 'forced';
+
+const MODES: { id: FailoverMode; label: string }[] = [
+ { id: 'hybrid', label: 'Hybrid (Default)' },
+ { id: 'graceful', label: 'Graceful' },
+ { id: 'forced', label: 'Forced' },
+];
+
+type StepDef = { text: string; note: string };
+
+const modeSteps: Record = {
+ hybrid: [
+ {
+ text: 'Temporal detects Primary unhealthy',
+ note: 'Health checks on error rates, latency, and infrastructure trigger the decision.',
+ },
+ {
+ text: 'Graceful handover attempted',
+ note: 'Primary stops accepting new writes. System waits up to 10 seconds for in-flight replication tasks to drain to the Replica.',
+ },
+ {
+ text: 'Forced failover if 10 s window expires',
+ note: 'If replication has not drained within 10 seconds, the system switches to a forced failover to prioritize availability.',
+ },
+ {
+ text: 'Replica promoted to active',
+ note: 'The Replica Namespace begins accepting Workflow writes. Existing and new Workers can start polling.',
+ },
+ {
+ text: 'DNS CNAME updated (~30 s convergence)',
+ note: '15-second TTL; most clients converge to the new active region within ~30 seconds.',
+ },
+ {
+ text: 'Conflict resolution on unreplicated events',
+ note: 'Highest-version history branch wins. Signals are re-injected. Some Activity progress may roll back and be retried.',
+ },
+ ],
+ graceful: [
+ {
+ text: 'Primary stops accepting new writes',
+ note: 'Clients will receive retryable "service unavailable" errors during the handover window.',
+ },
+ {
+ text: 'Wait for all replication tasks to drain',
+ note: 'The system waits until the Replica is fully caught up with the Primary. Default timeout is 10 seconds.',
+ },
+ {
+ text: 'Replica promoted to active',
+ note: 'Because all events are replicated before the switch, there is zero data loss (RPO = 0 for the events in-flight).',
+ },
+ {
+ text: 'Brief unavailability window (~10 s)',
+ note: 'Workflows pause during the handover. Temporal returns retryable errors. Workers and SDK retries handle this automatically.',
+ },
+ {
+ text: 'Full consistency guaranteed',
+ note: 'No conflict resolution needed — every event was replicated before the switch.',
+ },
+ ],
+ forced: [
+ {
+ text: 'Replica promoted immediately',
+ note: 'No waiting. The Replica becomes active right away, regardless of replication lag.',
+ },
+ {
+ text: 'Zero unavailability window',
+ note: 'The replica accepts traffic instantly. Use this when availability matters more than strict consistency.',
+ },
+ {
+ text: 'Unreplicated events undergo conflict resolution',
+ note: 'Events that did not reach the Replica before promotion are handled via conflict resolution. The highest-version branch is authoritative.',
+ },
+ {
+ text: 'Potential for some Workflow progress rollback',
+ note: 'Activity Executions that completed in the Primary but were not yet replicated will re-run. Idempotent Activities handle this safely.',
+ },
+ {
+ text: 'Signals are re-injected',
+ note: 'Temporal re-injects external events like Signals into the new history before discarding conflicting replication tasks.',
+ },
+ ],
+};
+
+const modeDescriptions: Record = {
+ hybrid: {
+ title: 'Hybrid Failover',
+ summary:
+ 'The default mode. Attempts a graceful handover first (up to 10 seconds), then automatically falls back to a forced failover if the window expires.',
+ tradeoff: 'Balances consistency and availability',
+ consistency: 'High — zero loss if drain succeeds, bounded loss otherwise',
+ availability: 'High — forced path kicks in if graceful stalls',
+ bestFor: 'Most production workloads',
+ },
+ graceful: {
+ title: 'Graceful Failover (Handover)',
+ summary:
+ 'Prioritizes consistency. The Primary stops writes and waits for the Replica to fully catch up before switching. Workflows see a brief unavailability window while the drain completes.',
+ tradeoff: 'Prioritizes consistency over availability',
+ consistency: 'Maximum — all in-flight events replicated before switch',
+ availability: 'Brief unavailability (~10 s) while draining',
+ bestFor: 'Financial or compliance workloads where data integrity is critical',
+ },
+ forced: {
+ title: 'Forced Failover',
+ summary:
+ 'Prioritizes availability. The Replica is promoted immediately with no drain window. Unreplicated events undergo conflict resolution after the switch.',
+ tradeoff: 'Prioritizes availability over consistency',
+ consistency: 'Lower — some Activity progress may roll back',
+ availability: 'Maximum — zero service interruption',
+ bestFor: 'Workloads where uptime is paramount and activities are idempotent',
+ },
+};
+
+export default function FailoverSection({ onNext }: Props) {
+ const [mode, setMode] = useState('hybrid');
+ const desc = modeDescriptions[mode];
+ const steps = modeSteps[mode];
+
+ return (
+
+
+
+
+
+
Failover Types
+
+ When Temporal Cloud detects that a Namespace is unhealthy, it triggers a failover. There
+ are three modes — each making a different trade-off between consistency and availability.
+
+
+
+ {MODES.map((m) => (
+
+ ))}
+
+
+
+
+ {desc.tradeoff}
+
+
{desc.title}
+
+ {desc.summary}
+
+
+
+
+
Consistency
+
+ {desc.consistency}
+
+
+
+
Availability
+
+ {desc.availability}
+
+
+
+
Best for
+
+ {desc.bestFor}
+
+
+
+
+
+ {steps.map((s, i) => (
+
+
{i + 1}
+
+
{s.text}
+
{s.note}
+
+
+ ))}
+
+
+
+
How to Trigger a Failover
+
+ Temporal-initiated automatic failover is enabled by default. You can also trigger one
+ manually via the UI, CLI, or API. Manual failovers follow standard failover procedures for
+ failback.
+
+
+
+
Web UI
+
+ Navigate to your Namespace detail page and click "Trigger a failover".
+ Select the target region and confirm.
+
+ POST to the FailoverNamespaceRegion endpoint via the Temporal Cloud Ops
+ API. The Terraform provider does not support failover triggering.
+
+
+
+
+
Automatic vs. Manual Failover
+
+
+
Temporal-initiated (default)
+
+ Temporal Cloud continuously monitors Namespace health and automatically triggers a
+ failover when thresholds are breached. Temporal strongly recommends keeping
+ automatic failover enabled. Disabling it means Temporal cannot guarantee RPO/RTO
+ objectives — your team becomes responsible for detecting and responding to failures.
+
+
+
+
Manual failover
+
+ You can disable Temporal-initiated failovers for full manual control. Trigger via UI,
+ CLI, or API on your own schedule.
+
+ Namespace endpoint unchanged — your gRPC address and credentials stay
+ the same after any failover.
+
+
+ Multi-region CNAME changes — the underlying CNAME target updates from
+ (e.g.) aws-us-west-1 to aws-us-east-1. DNS converges in ~30 s.
+ Same-region failovers are not affected.
+
+
+ Activities re-run — outstanding Activity Executions will time out and
+ be retried on the new primary. Make Activities idempotent.
+
+
+ Failback — Temporal automatically fails back after incident resolution
+ for auto-initiated failovers. Manual failovers require manually triggering the return
+ failover.
+
+
+ Replication lag before forced failover — monitor{' '}
+ temporal_cloud_v0_replication_lag_bucket before manually triggering a
+ forced failover to understand how much progress may roll back.
+
+ Step through the lifecycle of an HA Namespace — from normal async replication to an
+ end-to-end failover. Toggle between the two modes to see how each plays out.
+
+ Temporal Cloud High Availability gives every Namespace a synchronized replica — a standby
+ that can take over in under 20 minutes, with less than 1 minute of data loss, regardless of
+ whether an availability zone, an entire region, or a whole cloud provider fails.
+
+
+
+
+ The core idea in one sentence
+
+
+ Your Namespace asynchronously replicates all Workflow history to a replica in a separate
+ isolation domain — so if the primary fails, Temporal promotes the replica and your
+ Workflows keep running with minimal data loss.
+
+
+
+
The Problem HA Solves
+
+
+
Without HA
+
Single isolation domain
+
+ Standard Namespaces replicate across three availability zones in one region, handling
+ AZ-level failures automatically. But a cell-level bug, a regional outage, or a
+ cloud-provider incident can take your Namespace offline with no automatic recovery path.
+ Backups run every 4 hours — meaning hours of potential data loss.
+
+
+
+
With HA
+
Active primary + synchronized replica
+
+ Every Workflow Execution is asynchronously replicated to a replica in a separate
+ isolation domain. Temporal Cloud continuously monitors health and automatically fails
+ over to the replica — restoring service within the RTO window with sub-1-minute data
+ loss.
+
+ Temporal Cloud offers three replication models. Each protects against a different failure
+ scope. Choose based on what level of isolation your workload requires.
+
+
+ {/* 1: Same-region */}
+
+
+
1
+
+
+
+ Same-region Replication
+ Public Preview
+
+
Isolation domains within a single region
+
+ The Primary and Replica Namespaces live in separate isolation domains (cells)
+ within the same region. Each cell is an independent deployment of the Temporal
+ service — different databases, different compute, different failure blast radius.
+
+
+ This protects against cell-level software bugs or sub-component database failures that
+ survive all three availability zones in a single cell. Because the replica stays in the
+ same region, Workers don't need multi-region connectivity and latency impact is minimal.
+
+
+ Primary cell: aws-us-east-1 (cell A)
+ Replica cell: aws-us-east-1 (cell B)
+ // same region, separate isolation domains
+
+
+
+
+ {/* 2: Multi-region */}
+
+
+
2
+
+
+
+ Multi-region Replication
+ General Availability
+
+
Isolation domains in separate regions of the same cloud
+
+ The Primary and Replica Namespaces run in different geographic regions of
+ the same cloud provider (e.g. AWS us-east-1 and AWS us-west-2). This protects against
+ everything same-region covers, plus full region-wide incidents.
+
+
+ During failover, the Namespace CNAME record updates to the new active region. Workers
+ deployed in the replica region resume processing with near-zero latency. Workers only
+ in the primary region will experience cross-region latency until they are relocated or
+ replicas are deployed.
+
Isolation domains across different cloud providers
+
+ The Primary and Replica run on entirely different cloud providers (e.g.
+ AWS and GCP). Traffic automatically shifts to the replica if the entire primary cloud
+ provider experiences a widespread outage. This is the broadest failure protection
+ available.
+
+
+ Multi-cloud replication carries the highest operational complexity, including managing
+ worker fleets on two cloud providers and routing inter-cloud network traffic. Use this
+ when contractual obligations require cloud-provider-level redundancy.
+
+ For multi-region and multi-cloud replication, Temporal strongly recommends deploying
+ Workers in both the primary and replica regions. Workers in the primary region will
+ experience cross-region latency after failover until they are updated to point at the new
+ active region. Workers already running in the replica region pick up tasks immediately.
+
+ );
+}
diff --git a/src/components/HADemo/quizQuestions.ts b/src/components/HADemo/quizQuestions.ts
new file mode 100644
index 0000000000..520023dd18
--- /dev/null
+++ b/src/components/HADemo/quizQuestions.ts
@@ -0,0 +1,66 @@
+export type QuizQuestion = {
+ q: string;
+ options: readonly string[];
+ correct: number;
+ explanation: string;
+};
+
+export const quizQuestions: QuizQuestion[] = [
+ {
+ q: 'What does RPO (Recovery Point Objective) measure?',
+ options: [
+ 'How fast a system recovers after an outage',
+ 'The maximum acceptable amount of data loss measured in time',
+ 'The number of replicas required for HA',
+ 'The replication throughput in events per second',
+ ],
+ correct: 1,
+ explanation:
+ 'RPO measures how much data could be lost — "how far back in time could we roll back?" Temporal Cloud HA targets a sub-1-minute RPO, meaning at most ~1 minute of Workflow history could be lost during a regional failover.',
+ },
+ {
+ q: "What contractual SLA does Temporal Cloud's High Availability tier provide?",
+ options: ['99.9%', '99.95%', '99.99%', '100%'],
+ correct: 2,
+ explanation:
+ 'HA Namespaces carry a 99.99% uptime SLA. Standard Namespaces receive 99.9%. The extra nine is achieved through a synchronized replica that can take over within the RTO window.',
+ },
+ {
+ q: 'What is the RTO (Recovery Time Objective) for a High Availability Namespace?',
+ options: ['Instant (zero)', 'Under 1 minute', '20 minutes', '1 hour'],
+ correct: 2,
+ explanation:
+ 'Temporal Cloud HA targets a 20-minute RTO — the maximum time to restore service after a cell or regional failure. The RPO is sub-1-minute (data loss), while RTO covers the recovery duration.',
+ },
+ {
+ q: 'Which failover mode does Temporal Cloud use by default?',
+ options: [
+ 'Graceful (always waits for replication to drain)',
+ 'Forced (immediately promotes replica)',
+ 'Hybrid (tries graceful for 10 s, then forces)',
+ 'Manual only (no automatic failover)',
+ ],
+ correct: 2,
+ explanation:
+ 'Hybrid is the default. It first attempts a graceful handover (up to 10 seconds) to drain in-flight replication tasks, then automatically switches to a forced failover if the graceful window expires — balancing consistency and availability.',
+ },
+ {
+ q: 'What outage type does same-region replication protect against, but standard 3-zone replication does NOT?',
+ options: [
+ 'Individual availability zone failures',
+ 'Cell or sub-component failures within a region',
+ 'Complete AWS or GCP cloud-provider outages',
+ 'Network packet loss between workers',
+ ],
+ correct: 1,
+ explanation:
+ 'Standard Namespaces replicate across three availability zones and handle AZ failures. Same-region HA adds a second isolation domain (cell), protecting against cell-level software bugs or database failures that survive across all three zones.',
+ },
+ {
+ q: 'After removing a replica from a High Availability Namespace, how long must you wait before re-enabling HA in the same region?',
+ options: ['1 hour', '24 hours', '3 days', '7 days'],
+ correct: 3,
+ explanation:
+ 'Temporal enforces a 7-day cooldown before you can re-enable replication in the same location after removing it. This prevents rapid cycling that could create inconsistent state during the replica rebuild period.',
+ },
+];
diff --git a/src/components/HADemo/replicationSteps.ts b/src/components/HADemo/replicationSteps.ts
new file mode 100644
index 0000000000..bc5e7c0712
--- /dev/null
+++ b/src/components/HADemo/replicationSteps.ts
@@ -0,0 +1,128 @@
+export type FlowMode = 'replication' | 'failover';
+export type LogLevel = 'info' | 'success' | 'warn';
+
+export type FlowStep = {
+ label: string;
+ detail: string;
+ /** 0 = left node, 1 = middle node, 2 = right node */
+ activeNode: 0 | 1 | 2;
+ /** Progress along the tracks: 0–100 */
+ packetPct: number;
+ log: { level: LogLevel; event: string; msg: string };
+};
+
+export type NodeDef = { title: string; sub: string };
+
+export const replicationNodes: NodeDef[] = [
+ { title: 'Primary Namespace', sub: 'Active · writes accepted' },
+ { title: 'Replication Layer', sub: 'Async · background' },
+ { title: 'Replica Namespace', sub: 'Standby · read-only' },
+];
+
+export const failoverNodes: NodeDef[] = [
+ { title: 'Primary Namespace', sub: 'Failing · degraded' },
+ { title: 'HA Engine', sub: 'Detection · coordination' },
+ { title: 'Replica Namespace', sub: 'Promoting → Active' },
+];
+
+export const replicationSteps: FlowStep[] = [
+ {
+ label: 'Workflow event written to Primary',
+ detail:
+ 'A Workflow Execution event (e.g. WorkflowExecutionStarted) is durably recorded in the Primary Namespace and acknowledged to the client. Replication happens independently in the background.',
+ activeNode: 0,
+ packetPct: 0,
+ log: { level: 'info', event: 'WRITE', msg: 'Event persisted in Primary: WorkflowExecutionStarted' },
+ },
+ {
+ label: 'Replication task queued',
+ detail:
+ 'The replication engine queues an asynchronous task to copy the new History Event to the Replica. Client latency is unaffected — the client already received its acknowledgment.',
+ activeNode: 0,
+ packetPct: 20,
+ log: { level: 'info', event: 'QUEUE', msg: 'Replication task enqueued for Replica Namespace' },
+ },
+ {
+ label: 'Event transmitted to Replica',
+ detail:
+ 'The Replication Layer transmits the History Event and its version stamp to the Replica Namespace. Temporal Cloud targets a P95 replication lag of under 1 minute.',
+ activeNode: 1,
+ packetPct: 50,
+ log: { level: 'info', event: 'TRANSMIT', msg: 'Replication in-flight → Replica (p95 < 1 min lag)' },
+ },
+ {
+ label: 'Replica persists the event',
+ detail:
+ 'The Replica Namespace persists the replicated event across its own availability zones and stores the version history entry used for conflict resolution during failover.',
+ activeNode: 2,
+ packetPct: 100,
+ log: { level: 'success', event: 'PERSIST', msg: 'Replica wrote event with version stamp' },
+ },
+ {
+ label: 'Replica ready for failover',
+ detail:
+ 'The Replica is synchronized and healthy. All Visibility APIs remain queryable against both Primary and Replica. The system is ready to fail over with sub-1-minute data loss if needed.',
+ activeNode: 2,
+ packetPct: 100,
+ log: { level: 'success', event: 'READY', msg: 'Replica health OK — RPO target achievable' },
+ },
+];
+
+export const failoverSteps: FlowStep[] = [
+ {
+ label: 'Health check detects degradation',
+ detail:
+ 'Temporal Cloud monitors the Primary Namespace continuously. When error rates, latencies, or infrastructure checks exceed failure thresholds, the HA Engine triggers a failover.',
+ activeNode: 0,
+ packetPct: 0,
+ log: { level: 'warn', event: 'HEALTH', msg: 'Primary health check failed — thresholds exceeded' },
+ },
+ {
+ label: 'Hybrid failover initiated (10 s window)',
+ detail:
+ 'The default Hybrid mode first attempts a graceful handover: the Primary stops accepting new writes and waits up to 10 seconds for in-flight replication tasks to drain to the Replica.',
+ activeNode: 1,
+ packetPct: 50,
+ log: { level: 'warn', event: 'FAILOVER', msg: 'Hybrid: graceful handover attempted (10 s timeout)' },
+ },
+ {
+ label: 'Replica promoted to active',
+ detail:
+ 'After the graceful window (or immediately in forced mode), the Replica Namespace is promoted to active. It begins accepting Workflow traffic. The old Primary enters standby mode.',
+ activeNode: 2,
+ packetPct: 100,
+ log: { level: 'success', event: 'PROMOTE', msg: 'Replica promoted — now accepting writes' },
+ },
+ {
+ label: 'DNS CNAME updated',
+ detail:
+ 'The Namespace CNAME record is updated to point to the new active region (15 s TTL). Clients and Workers begin resolving to the new active endpoint within approximately 30 seconds.',
+ activeNode: 2,
+ packetPct: 100,
+ log: { level: 'info', event: 'DNS', msg: 'CNAME updated → new region (15 s TTL, ~30 s convergence)' },
+ },
+ {
+ label: 'Conflict resolution applied',
+ detail:
+ 'Any events that were in-flight and not yet replicated undergo conflict resolution. The history branch with the highest Namespace version is authoritative. Signals are re-injected; some Activity progress may roll back.',
+ activeNode: 1,
+ packetPct: 50,
+ log: { level: 'info', event: 'RESOLVE', msg: 'Conflict resolution: highest-version branch wins' },
+ },
+ {
+ label: 'Old Primary demoted to Replica',
+ detail:
+ 'The original Primary becomes the new standby Replica. Outstanding Activities will time out and be retried on the new active Namespace — handle these like any Worker restart.',
+ activeNode: 0,
+ packetPct: 0,
+ log: { level: 'info', event: 'DEMOTE', msg: 'Original Primary is now Replica (standby)' },
+ },
+ {
+ label: 'Failover complete',
+ detail:
+ 'Traffic is fully shifted. Workflow Executions continue from their last replicated state. Your Namespace endpoint URL and credentials are unchanged — no client updates required.',
+ activeNode: 2,
+ packetPct: 100,
+ log: { level: 'success', event: 'COMPLETE', msg: 'Failover complete — RTO target: 20 min' },
+ },
+];