diff --git a/docs/ha-demo.mdx b/docs/ha-demo.mdx new file mode 100644 index 0000000000..2addc54b8d --- /dev/null +++ b/docs/ha-demo.mdx @@ -0,0 +1,21 @@ +--- +id: ha-demo +title: Temporal High Availability - Interactive Demo +sidebar_label: High Availability +description: An interactive walkthrough of Temporal Cloud High Availability — what it is, how replication works, and how to enable it. +slug: /ha-demo +tags: + - High Availability + - Cloud +keywords: + - temporal high availability demo + - same-region replication + - multi-region replication + - failover + - RPO RTO +hide_table_of_contents: true +--- + +import HADemo from '@site/src/components/HADemo'; + + diff --git a/sidebars.js b/sidebars.js index d5944d13fb..35813d8fe3 100644 --- a/sidebars.js +++ b/sidebars.js @@ -890,6 +890,15 @@ module.exports = { ], }, 'glossary', + { + type: 'category', + label: 'Interactive Demos', + collapsed: true, + items: [ + // 'nexus-demo', // added in PR #4332 + 'ha-demo', + ], + }, 'with-ai', // { // type: "autogenerated", diff --git a/src/components/HADemo/EnableIt.tsx b/src/components/HADemo/EnableIt.tsx new file mode 100644 index 0000000000..083d0920b8 --- /dev/null +++ b/src/components/HADemo/EnableIt.tsx @@ -0,0 +1,264 @@ +import React, { useState } from 'react'; +import styles from './HADemo.module.css'; + +type Props = { onNext: () => void }; +type Tab = 'create' | 'add' | 'remove'; + +const TABS: { id: Tab; label: string }[] = [ + { id: 'create', label: 'Create with HA' }, + { id: 'add', label: 'Add replica to existing' }, + { id: 'remove', label: 'Remove replica' }, +]; + +function Cmd({ prompt, cmd }: { prompt: string; cmd: string }) { + return ( +
+ {prompt}  + {cmd} +
+ ); +} + +export default function EnableIt({ onNext }: Props) { + const [tab, setTab] = useState('create'); + + return ( +
+
+
+
+ +

Enable High Availability

+

+ You can add a replica when creating a new Namespace or at any time on an existing one. + Both the Web UI and tcld CLI are supported. +

+ +
+ {TABS.map((t) => ( + + ))} +
+ + {/* ── Create with HA ── */} + {tab === 'create' && ( + <> +
+
New Namespace
+

+ Specify both a primary region and a replica region when creating the Namespace. + Temporal Cloud immediately begins replicating all new Workflow history to the replica. +

+
+ +
Via tcld
+ . \\ + --region \\ + --region `} + /> +
+ # Example (multi-region, GA): + # --region aws-us-east-1 --region aws-us-west-2 +
+ +
Via Web UI
+
+
+
1
+
+
Navigate to Temporal Cloud and click Create Namespace
+
+
+
+
2
+
+
Select the primary region
+
e.g. AWS us-east-1
+
+
+
+
3
+
+
Click "Add a replica" and choose the replica region
+
+ Same-region and multi-cloud are in Public Preview; multi-region is GA. +
+
+
+
+
4
+
+
Confirm and create
+
+ Replication begins immediately for all new Workflow Executions. +
+
+
+
+ + )} + + {/* ── Add to existing ── */} + {tab === 'add' && ( + <> +
+
Existing Namespace
+

+ You can add a replica to an existing Namespace at any time. Temporal Cloud begins + asynchronously replicating all ongoing and historical Workflow + Executions to the new replica. +

+
+ +
Via tcld
+ . \\ + --region `} + /> + +
Via Web UI
+
+
+
1
+
+
+ Go to your Namespace detail page in Temporal Cloud +
+
+
+
+
2
+
+
+ Click "Add a replica" +
+
+
+
+
3
+
+
+ Select the desired replica region and confirm +
+
+ Replication of existing history begins immediately. +
+
+
+
+ +
+ ℹ️ + + Changing replica location: Direct migration of a replica to a + different region is not supported. Remove the existing replica first, then add a new + one in the desired location. You must wait 7 days before re-enabling + HA in the same region after removal. + +
+ + )} + + {/* ── Remove replica ── */} + {tab === 'remove' && ( + <> +
+
Disable HA
+

+ Removing a replica disables replication and reverts the Namespace to a standard + single-isolation-domain configuration. A 7-day cooldown then applies before you can + re-add a replica in the same location. +

+
+ +
Via tcld
+ \\ + --namespace . \\ + --region `} + /> + +
Via Web UI
+
+
+
1
+
+
+ Go to your Namespace detail page +
+
+
+
+
2
+
+
+ Find the replica entry and click "Remove replica" +
+
+
+
+
3
+
+
Confirm the removal
+
+ The Namespace reverts to standard configuration. SLA reverts to 99.9%. +
+
+
+
+ +
+ ⚠️ + + 7-day cooldown: After removing a replica, you cannot re-enable HA + in the same location for 7 days. Plan replica migrations carefully — remove the + existing replica first, then add a replica in the new location. + +
+ + )} + +

Monitoring Replication Health

+
+
+
Cloud UI
+

+ The Temporal Cloud UI displays replica status and health. When a replica becomes + unhealthy, failover options are automatically disabled to prevent cascading issues. +

+
+
+
Metrics
+

+ Replication lag is emitted as pre-computed percentiles (p50, p95, p99) on the metric{' '} + temporal_cloud_v0_replication_lag_bucket, labeled with{' '} + temporal_namespace. Target: p95 < 1 minute. +

+

+ Note: temporal_cloud_v1_total_action_count may appear doubled for HA + Namespaces — actions are replicated on both primary and replica. +

+
+
+ +
+ +
+
+ ); +} diff --git a/src/components/HADemo/FailoverSection.tsx b/src/components/HADemo/FailoverSection.tsx new file mode 100644 index 0000000000..192b90ec80 --- /dev/null +++ b/src/components/HADemo/FailoverSection.tsx @@ -0,0 +1,292 @@ +import React, { useState } from 'react'; +import styles from './HADemo.module.css'; + +type Props = { onNext: () => void }; +type FailoverMode = 'hybrid' | 'graceful' | 'forced'; + +const MODES: { id: FailoverMode; label: string }[] = [ + { id: 'hybrid', label: 'Hybrid (Default)' }, + { id: 'graceful', label: 'Graceful' }, + { id: 'forced', label: 'Forced' }, +]; + +type StepDef = { text: string; note: string }; + +const modeSteps: Record = { + hybrid: [ + { + text: 'Temporal detects Primary unhealthy', + note: 'Health checks on error rates, latency, and infrastructure trigger the decision.', + }, + { + text: 'Graceful handover attempted', + note: 'Primary stops accepting new writes. System waits up to 10 seconds for in-flight replication tasks to drain to the Replica.', + }, + { + text: 'Forced failover if 10 s window expires', + note: 'If replication has not drained within 10 seconds, the system switches to a forced failover to prioritize availability.', + }, + { + text: 'Replica promoted to active', + note: 'The Replica Namespace begins accepting Workflow writes. Existing and new Workers can start polling.', + }, + { + text: 'DNS CNAME updated (~30 s convergence)', + note: '15-second TTL; most clients converge to the new active region within ~30 seconds.', + }, + { + text: 'Conflict resolution on unreplicated events', + note: 'Highest-version history branch wins. Signals are re-injected. 
Some Activity progress may roll back and be retried.', + }, + ], + graceful: [ + { + text: 'Primary stops accepting new writes', + note: 'Clients will receive retryable "service unavailable" errors during the handover window.', + }, + { + text: 'Wait for all replication tasks to drain', + note: 'The system waits until the Replica is fully caught up with the Primary. Default timeout is 10 seconds.', + }, + { + text: 'Replica promoted to active', + note: 'Because all events are replicated before the switch, there is zero data loss (RPO = 0 for the events in-flight).', + }, + { + text: 'Brief unavailability window (~10 s)', + note: 'Workflows pause during the handover. Temporal returns retryable errors. Workers and SDK retries handle this automatically.', + }, + { + text: 'Full consistency guaranteed', + note: 'No conflict resolution needed — every event was replicated before the switch.', + }, + ], + forced: [ + { + text: 'Replica promoted immediately', + note: 'No waiting. The Replica becomes active right away, regardless of replication lag.', + }, + { + text: 'Zero unavailability window', + note: 'The replica accepts traffic instantly. Use this when availability matters more than strict consistency.', + }, + { + text: 'Unreplicated events undergo conflict resolution', + note: 'Events that did not reach the Replica before promotion are handled via conflict resolution. The highest-version branch is authoritative.', + }, + { + text: 'Potential for some Workflow progress rollback', + note: 'Activity Executions that completed in the Primary but were not yet replicated will re-run. Idempotent Activities handle this safely.', + }, + { + text: 'Signals are re-injected', + note: 'Temporal re-injects external events like Signals into the new history before discarding conflicting replication tasks.', + }, + ], +}; + +const modeDescriptions: Record = { + hybrid: { + title: 'Hybrid Failover', + summary: + 'The default mode. 
Attempts a graceful handover first (up to 10 seconds), then automatically falls back to a forced failover if the window expires.', + tradeoff: 'Balances consistency and availability', + consistency: 'High — zero loss if drain succeeds, bounded loss otherwise', + availability: 'High — forced path kicks in if graceful stalls', + bestFor: 'Most production workloads', + }, + graceful: { + title: 'Graceful Failover (Handover)', + summary: + 'Prioritizes consistency. The Primary stops writes and waits for the Replica to fully catch up before switching. Workflows see a brief unavailability window while the drain completes.', + tradeoff: 'Prioritizes consistency over availability', + consistency: 'Maximum — all in-flight events replicated before switch', + availability: 'Brief unavailability (~10 s) while draining', + bestFor: 'Financial or compliance workloads where data integrity is critical', + }, + forced: { + title: 'Forced Failover', + summary: + 'Prioritizes availability. The Replica is promoted immediately with no drain window. Unreplicated events undergo conflict resolution after the switch.', + tradeoff: 'Prioritizes availability over consistency', + consistency: 'Lower — some Activity progress may roll back', + availability: 'Maximum — zero service interruption', + bestFor: 'Workloads where uptime is paramount and activities are idempotent', + }, +}; + +export default function FailoverSection({ onNext }: Props) { + const [mode, setMode] = useState('hybrid'); + const desc = modeDescriptions[mode]; + const steps = modeSteps[mode]; + + return ( +
+
+
+
+ +

Failover Types

+

+ When Temporal Cloud detects that a Namespace is unhealthy, it triggers a failover. There + are three modes — each making a different trade-off between consistency and availability. +

+ +
+ {MODES.map((m) => ( + + ))} +
+ +
+
+ {desc.tradeoff} +
+

{desc.title}

+

+ {desc.summary} +

+ +
+
+
Consistency
+
+ {desc.consistency} +
+
+
+
Availability
+
+ {desc.availability} +
+
+
+
Best for
+
+ {desc.bestFor} +
+
+
+ +
+ {steps.map((s, i) => ( +
+
{i + 1}
+
+
{s.text}
+
{s.note}
+
+
+ ))} +
+
+ +

How to Trigger a Failover

+

+ Temporal-initiated automatic failover is enabled by default. You can also trigger one + manually via the UI, CLI, or API. After a manual failover, you must also trigger the + failback manually. +

+
+
+
Web UI
+

+ Navigate to your Namespace detail page and click "Trigger a failover". + Select the target region and confirm. +

+
+
+
tcld CLI
+
+ + + {'tcld namespace failover \\\n --namespace . \\\n --region '} + +
+
+
+
Cloud Ops API
+

+ POST to the FailoverNamespaceRegion endpoint via the Temporal Cloud Ops + API. The Terraform provider does not support failover triggering. +

+
+
+ +

Automatic vs. Manual Failover

+
+
+
Temporal-initiated (default)
+

+ Temporal Cloud continuously monitors Namespace health and automatically triggers a + failover when thresholds are breached. Temporal strongly recommends keeping + automatic failover enabled. Disabling it means Temporal cannot guarantee RPO/RTO + objectives — your team becomes responsible for detecting and responding to failures. +

+
+
+
Manual failover
+

+ You can disable Temporal-initiated failovers for full manual control. Trigger via UI, + CLI, or API on your own schedule. +

+
+ + + {'tcld namespace update-high-availability \\\n --namespace . \\\n --disable-auto-failover=true'} + +
+
+
+ +

Post-failover Considerations

+
+
    +
  • + Namespace endpoint unchanged — your gRPC address and credentials stay + the same after any failover. +
  • +
  • + Multi-region CNAME changes — the underlying CNAME target updates from + (e.g.) aws-us-west-1 to aws-us-east-1. DNS converges in ~30 s. + Same-region failovers are not affected. +
  • +
  • + Activities re-run — outstanding Activity Executions will time out and + be retried on the new primary. Make Activities idempotent. +
  • +
  • + Failback — Temporal automatically fails back after incident resolution + for auto-initiated failovers. Manual failovers require manually triggering the return + failover. +
  • +
+ Replication lag before forced failover — monitor{' '} + temporal_cloud_v0_replication_lag_bucket before manually triggering a + forced failover to understand how much progress may roll back. +
  • +
+
+ +
+ +
+
+ ); +} diff --git a/src/components/HADemo/HADemo.module.css b/src/components/HADemo/HADemo.module.css new file mode 100644 index 0000000000..9cfbd73f9a --- /dev/null +++ b/src/components/HADemo/HADemo.module.css @@ -0,0 +1,1095 @@ +/* + * HADemo.module.css + * Uses Docusaurus IFM variables for automatic light/dark mode support. + * Demo-specific accent colors are declared per-theme below. + */ + +/* ── Per-theme accent vars ─────────────────────────────── */ +:global([data-theme='dark']) { + --hd-surface: #1e1e1e; + --hd-surface2: #252525; + --hd-accent-text: #d4d7ff; + --hd-num-badge-bg: #7F86F1; + --hd-border: rgba(255, 255, 255, 0.1); + --hd-muted: #94a3b8; + --hd-nav-inactive: #94a3b8; + --hd-nav-hover: #d4d7ff; + --hd-nav-active: #7F86F1; + --hd-progress-fill: #7F86F1; + --hd-purple: #a78bfa; + --hd-purple-bg: rgba(167, 139, 250, 0.12); + --hd-green: #34d399; + --hd-green-bg: rgba(52, 211, 153, 0.1); + --hd-red: #f87171; + --hd-red-bg: rgba(248, 113, 113, 0.1); + --hd-orange: #f97316; + --hd-orange-bg: rgba(249, 115, 22, 0.15); + --hd-amber: #fbbf24; + --hd-amber-bg: rgba(251, 191, 36, 0.1); + --hd-primary-bg: rgba(191, 219, 254, 0.1); + --hd-uv: #444CE7; + --hd-uv-bg: rgba(68, 76, 231, 0.18); + --hd-teal: #1FF1A5; + --hd-teal-bg: rgba(31, 241, 165, 0.15); + --hd-card-bg: rgba(31, 32, 63, 0.6); + --hd-card-border: rgba(127, 134, 241, 0.15); + --hd-primary-node-border: #7F86F1; + --hd-replica-node-border: #34d399; + --hd-failing-node-border: #f87171; +} + +:global([data-theme='light']) { + --hd-surface: #ffffff; + --hd-surface2: #f1f5f9; + --hd-accent-text: #444CE7; + --hd-num-badge-bg: #444CE7; + --hd-border: rgba(0, 0, 0, 0.08); + --hd-muted: #64748b; + --hd-nav-inactive: #64748b; + --hd-nav-hover: #444CE7; + --hd-nav-active: #444CE7; + --hd-progress-fill: #444CE7; + --hd-purple: #7c3aed; + --hd-purple-bg: rgba(124, 58, 237, 0.08); + --hd-green: #059669; + --hd-green-bg: rgba(5, 150, 105, 0.08); + --hd-red: #dc2626; + --hd-red-bg: rgba(220, 38, 38, 0.08); + 
--hd-orange: #ea580c; + --hd-orange-bg: rgba(234, 88, 12, 0.1); + --hd-amber: #d97706; + --hd-amber-bg: rgba(217, 119, 6, 0.08); + --hd-primary-bg: rgba(29, 78, 216, 0.06); + --hd-uv: #3a41cc; + --hd-uv-bg: rgba(58, 65, 204, 0.1); + --hd-teal: #059669; + --hd-teal-bg: rgba(5, 150, 105, 0.1); + --hd-card-bg: #ffffff; + --hd-card-border: rgba(0, 0, 0, 0.09); + --hd-primary-node-border: #444CE7; + --hd-replica-node-border: #059669; + --hd-failing-node-border: #dc2626; +} + +/* ── Shell ─────────────────────────────────────────────── */ +.shell { + font-family: var(--ifm-font-family-base); + color: var(--ifm-font-color-base); + background: var(--ifm-background-color); + min-height: 100vh; +} + +:global([data-theme='light']) .shell { + background: #f4f6f9; +} + +:global([data-theme='light']) .card { + background: #ffffff; +} + +/* ── Nav ───────────────────────────────────────────────── */ +.nav { + position: sticky; + top: var(--ifm-navbar-height); + z-index: 50; + background: var(--ifm-background-color); + border-bottom: 1px solid var(--hd-border); + display: flex; + align-items: center; + gap: 4px; + padding: 0 24px; + overflow-x: auto; + scrollbar-width: none; +} + +.nav::-webkit-scrollbar { + display: none; +} + +.navLogo { + font-weight: 700; + font-size: 14px; + color: var(--ifm-color-primary); + margin-right: 12px; + white-space: nowrap; + flex-shrink: 0; +} + +.navBtn { + background: none; + border: none; + border-bottom: 2px solid transparent; + color: var(--hd-nav-inactive); + cursor: pointer; + font-size: 13px; + font-family: var(--ifm-font-family-base); + padding: 12px 14px; + white-space: nowrap; + transition: color 0.15s, border-color 0.15s; + flex-shrink: 0; + margin-bottom: -1px; +} + +.navBtn:hover { + color: var(--hd-nav-hover); +} + +.navBtnActive { + color: var(--hd-nav-active); + border-bottom-color: var(--hd-nav-active); +} + +/* ── Sections ──────────────────────────────────────────── */ +.section { + padding: 40px 24px 64px; + max-width: 960px; 
+ margin: 0 auto; + animation: fadeUp 0.25s ease forwards; +} + +@keyframes fadeUp { + from { + opacity: 0; + transform: translateY(6px); + } + to { + opacity: 1; + transform: none; + } +} + +/* ── Progress bar ──────────────────────────────────────── */ +.progressBar { + height: 3px; + background: var(--hd-border); + border-radius: 0; + margin-bottom: 36px; + overflow: hidden; +} + +.progressFill { + height: 100%; + background: var(--hd-progress-fill); + border-radius: 0; + transition: width 0.4s ease; +} + +/* ── Typography ────────────────────────────────────────── */ +.lead { + font-size: 16px; + color: var(--ifm-font-color-base); + max-width: 680px; + margin-bottom: 32px; + line-height: 1.7; +} + +.sectionHeading { + font-size: 20px; + font-weight: 600; + color: var(--ifm-font-color-base); + margin: 32px 0 16px; +} + +/* ── Status badges ─────────────────────────────────────── */ +.statusBadge { + display: inline-block; + font-size: 10px; + font-weight: 700; + padding: 2px 7px; + border-radius: 9999px; + text-transform: uppercase; + letter-spacing: 0.5px; + margin-left: 8px; + vertical-align: middle; +} + +.statusBadgeGA { + background: var(--hd-green-bg); + color: var(--hd-green); +} + +.statusBadgePreview { + background: var(--hd-amber-bg); + color: var(--hd-amber); +} + +/* ── Cards ─────────────────────────────────────────────── */ +.card { + background: var(--hd-surface); + border: 1px solid var(--hd-border); + border-radius: 0; + padding: 20px; + margin-bottom: 16px; +} + +.cardGrid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); + gap: 16px; + margin-bottom: 32px; +} + +.cardGrid .card { + margin-bottom: 0; +} + +/* ── Tags ──────────────────────────────────────────────── */ +.tag { + display: inline-block; + font-size: 11px; + font-weight: 600; + padding: 2px 8px; + border-radius: 0; + margin-bottom: 8px; + text-transform: uppercase; + letter-spacing: 0.5px; +} + +.tagBlue { + background: var(--hd-primary-bg); + color: 
var(--ifm-color-primary); +} + +.tagPurple { + background: var(--hd-purple-bg); + color: var(--hd-purple); +} + +.tagGreen { + background: var(--hd-green-bg); + color: var(--hd-green); +} + +.tagRed { + background: var(--hd-red-bg); + color: var(--hd-red); +} + +.tagAmber { + background: var(--hd-amber-bg); + color: var(--hd-amber); +} + +.tagOrange { + background: var(--hd-orange-bg); + color: var(--hd-orange); +} + +.tagTeal { + background: var(--hd-teal-bg); + color: var(--hd-teal); +} + +/* ── Buttons ───────────────────────────────────────────── */ +.btn { + display: inline-flex; + align-items: center; + gap: 6px; + background: var(--ifm-color-primary); + color: #fff; + border: none; + border-radius: 0; + padding: 10px 20px; + font-size: 14px; + font-family: var(--ifm-font-family-base); + font-weight: 600; + cursor: pointer; + transition: opacity 0.15s; +} + +.btn:hover { + opacity: 0.88; + color: #fff; +} + +.btn:disabled { + opacity: 0.35; + cursor: not-allowed; +} + +.btnSecondary { + background: var(--hd-surface); + color: var(--ifm-font-color-base); + border: 1px solid var(--hd-border); +} + +.btnSecondary:hover { + opacity: 1; + border-color: var(--ifm-color-primary); + color: var(--ifm-color-primary); +} + +.nextRow { + margin-top: 40px; + display: flex; + justify-content: flex-end; +} + +/* ── Overview: namespace diagram ───────────────────────── */ +.haDiagram { + display: flex; + gap: 0; + align-items: stretch; + margin: 24px 0 32px; + flex-wrap: wrap; + gap: 16px; +} + +.haNode { + flex: 1; + min-width: 200px; + background: var(--hd-surface); + border: 2px solid var(--hd-border); + border-radius: 0; + padding: 16px; +} + +.haNodePrimary { + border-color: var(--hd-primary-node-border); +} + +.haNodeReplica { + border-color: var(--hd-replica-node-border); +} + +.haNodeLabel { + font-size: 11px; + font-weight: 700; + text-transform: uppercase; + letter-spacing: 1px; + padding: 3px 10px; + border-radius: 0; + display: inline-block; + margin-bottom: 
12px; +} + +.haNodeLabelPrimary { + background: var(--hd-primary-bg); + color: var(--ifm-color-primary); +} + +.haNodeLabelReplica { + background: var(--hd-green-bg); + color: var(--hd-green); +} + +.haArrow { + display: flex; + align-items: center; + justify-content: center; + flex-direction: column; + gap: 4px; + flex-shrink: 0; + color: var(--hd-muted); + min-width: 80px; +} + +.haArrowLabel { + font-size: 11px; + text-align: center; + color: var(--hd-muted); + white-space: nowrap; +} + +.wfBlock { + display: flex; + align-items: center; + gap: 10px; + padding: 8px 10px; + background: var(--hd-surface2); + border-radius: 0; + margin-bottom: 8px; + border: 1px solid var(--hd-border); +} + +.wfBlock:last-child { + margin-bottom: 0; +} + +.wfIcon { + width: 28px; + height: 28px; + border-radius: 0; + display: flex; + align-items: center; + justify-content: center; + font-weight: 700; + font-size: 13px; + flex-shrink: 0; +} + +.wfIconBlue { + background: var(--hd-primary-bg); + color: var(--ifm-color-primary); +} + +.wfIconGreen { + background: var(--hd-green-bg); + color: var(--hd-green); +} + +/* ── SLA comparison table ──────────────────────────────── */ +.slaTable { + width: 100%; + border-collapse: collapse; + margin-bottom: 32px; + font-size: 14px; +} + +.slaTable th { + text-align: left; + padding: 10px 14px; + background: var(--hd-surface2); + border: 1px solid var(--hd-border); + font-weight: 600; + font-size: 12px; + text-transform: uppercase; + letter-spacing: 0.5px; + color: var(--hd-muted); +} + +.slaTable td { + padding: 10px 14px; + border: 1px solid var(--hd-border); + color: var(--ifm-font-color-base); + vertical-align: top; +} + +.slaTable tr:hover td { + background: var(--hd-surface2); +} + +.slaGreen { + color: var(--hd-green); + font-weight: 600; +} + +.slaAmber { + color: var(--hd-amber); + font-weight: 600; +} + +/* ── Replication type cards ────────────────────────────── */ +.rtCard { + display: flex; + gap: 0; + background: 
var(--hd-surface); + border: 1px solid var(--hd-border); + border-radius: 0; + margin-bottom: 16px; + overflow: hidden; +} + +.rtNumCol { + width: 48px; + flex-shrink: 0; + display: flex; + align-items: flex-start; + justify-content: center; + padding-top: 20px; + background: var(--hd-surface2); + border-right: 1px solid var(--hd-border); +} + +.rtNum { + width: 28px; + height: 28px; + border-radius: 50%; + background: var(--hd-num-badge-bg); + color: #fff; + font-size: 13px; + font-weight: 700; + display: flex; + align-items: center; + justify-content: center; + flex-shrink: 0; +} + +.rtContent { + padding: 20px 24px; + flex: 1; +} + +.rtTitle { + font-size: 17px; + font-weight: 700; + margin: 0 0 4px; + color: var(--ifm-font-color-base); +} + +.rtRole { + font-size: 13px; + color: var(--hd-muted); + margin: 0 0 12px; +} + +.rtCode { + font-family: var(--ifm-font-family-monospace); + font-size: 12px; + background: var(--hd-surface2); + border: 1px solid var(--hd-border); + padding: 10px 14px; + line-height: 1.7; + color: var(--ifm-font-color-base); + margin-top: 12px; +} + +.coverageGrid { + display: grid; + grid-template-columns: repeat(4, 1fr); + gap: 0; + border: 1px solid var(--hd-border); + margin-bottom: 32px; + font-size: 13px; +} + +.coverageHeader { + padding: 10px 12px; + background: var(--hd-surface2); + border-bottom: 1px solid var(--hd-border); + border-right: 1px solid var(--hd-border); + font-weight: 600; + font-size: 12px; + color: var(--hd-muted); + text-transform: uppercase; + letter-spacing: 0.4px; +} + +.coverageHeader:last-child { + border-right: none; +} + +.coverageCell { + padding: 10px 12px; + border-bottom: 1px solid var(--hd-border); + border-right: 1px solid var(--hd-border); + color: var(--ifm-font-color-base); +} + +.coverageCell:last-child { + border-right: none; +} + +.coverageRow:last-child .coverageCell { + border-bottom: none; +} + +.coverageCheck { + color: var(--hd-green); + font-weight: 700; +} + +.coverageDash { + color: 
var(--hd-muted); +} + +/* ── HowItWorks flow ───────────────────────────────────── */ +.runTabs { + display: flex; + gap: 0; + border-bottom: 1px solid var(--hd-border); + margin-bottom: 24px; +} + +.runTab { + background: none; + border: none; + border-bottom: 2px solid transparent; + color: var(--hd-nav-inactive); + cursor: pointer; + font-size: 14px; + font-family: var(--ifm-font-family-base); + padding: 10px 18px; + margin-bottom: -1px; + transition: color 0.15s, border-color 0.15s; +} + +.runTab:hover { + color: var(--hd-nav-hover); +} + +.runTabActive { + color: var(--hd-nav-active); + border-bottom-color: var(--hd-nav-active); + font-weight: 600; +} + +.flowOuter { + display: grid; + grid-template-columns: 1fr; + gap: 16px; +} + +.flowDiagram { + display: flex; + align-items: center; + gap: 0; + padding: 20px; + background: var(--hd-surface); + border: 1px solid var(--hd-border); +} + +.flowNode { + flex: 1; + padding: 14px 12px; + border: 2px solid var(--hd-border); + border-radius: 0; + text-align: center; + background: var(--hd-surface2); + transition: border-color 0.2s, background 0.2s; + min-width: 120px; +} + +.flowNodeActive { + border-color: var(--ifm-color-primary); + background: var(--hd-primary-bg); +} + +.flowNodeTitle { + font-size: 13px; + font-weight: 600; + color: var(--ifm-font-color-base); + margin-bottom: 4px; +} + +.flowNodeSub { + font-size: 11px; + color: var(--hd-muted); +} + +.flowTrackWrap { + flex: 0 0 60px; + padding: 0 4px; + display: flex; + align-items: center; +} + +.flowTrack { + height: 3px; + width: 100%; + background: var(--hd-border); + border-radius: 0; + overflow: hidden; + position: relative; +} + +.flowTrackFill { + height: 100%; + background: var(--ifm-color-primary); + transition: width 0.5s ease; +} + +.flowTrackFillEmpty { + background: transparent; +} + +.statusLog { + height: 120px; + overflow-y: auto; + background: #0d1117; + border: 1px solid var(--hd-border); + padding: 12px; + font-family: 
var(--ifm-font-family-monospace); + font-size: 12px; +} + +.logLine { + display: flex; + gap: 8px; + margin-bottom: 4px; + line-height: 1.5; +} + +.logLine:last-child { + margin-bottom: 0; +} + +.logTs { + color: #6b7280; + flex-shrink: 0; + user-select: none; +} + +.logEvent { + flex-shrink: 0; + font-weight: 600; + padding: 0 2px; +} + +.logEventInfo { + color: #60a5fa; +} + +.logEventSuccess { + color: #34d399; +} + +.logEventWarn { + color: #fbbf24; +} + +.logMsg { + color: #d1d5db; +} + +.flowControls { + display: flex; + gap: 8px; +} + +.timeline { + display: flex; + flex-direction: column; + gap: 0; + border: 1px solid var(--hd-border); +} + +.tlStep { + display: flex; + gap: 12px; + padding: 12px 16px; + border-bottom: 1px solid var(--hd-border); + transition: background 0.15s; +} + +.tlStep:last-child { + border-bottom: none; +} + +.tlStepDone { + opacity: 0.5; +} + +.tlStepActive { + background: var(--hd-primary-bg); + border-left: 3px solid var(--ifm-color-primary); +} + +.tlNum { + width: 22px; + height: 22px; + border-radius: 50%; + background: var(--hd-border); + color: var(--hd-muted); + font-size: 11px; + font-weight: 700; + display: flex; + align-items: center; + justify-content: center; + flex-shrink: 0; + margin-top: 1px; + transition: background 0.15s, color 0.15s; +} + +.tlStepDone .tlNum { + background: var(--hd-green); + color: #fff; +} + +.tlStepActive .tlNum { + background: var(--ifm-color-primary); + color: #fff; +} + +.tlContent { + flex: 1; +} + +.tlTitle { + font-size: 13px; + font-weight: 600; + color: var(--ifm-font-color-base); + margin-bottom: 2px; +} + +.tlDesc { + font-size: 12px; + color: var(--hd-muted); + line-height: 1.5; +} + +/* ── Failover section ──────────────────────────────────── */ +.failoverCard { + background: var(--hd-surface); + border: 1px solid var(--hd-border); + border-radius: 0; + padding: 24px; + margin-bottom: 16px; +} + +.failoverGrid { + display: grid; + grid-template-columns: 1fr 1fr; + gap: 12px; + 
margin-top: 16px; +} + +@media (max-width: 640px) { + .failoverGrid { + grid-template-columns: 1fr; + } +} + +.failoverMetric { + padding: 14px; + background: var(--hd-surface2); + border: 1px solid var(--hd-border); +} + +.failoverMetricLabel { + font-size: 11px; + font-weight: 700; + text-transform: uppercase; + letter-spacing: 0.5px; + color: var(--hd-muted); + margin-bottom: 4px; +} + +.failoverMetricValue { + font-size: 18px; + font-weight: 700; + color: var(--ifm-font-color-base); +} + +.failoverSteps { + display: flex; + flex-direction: column; + gap: 0; + margin-top: 16px; + border: 1px solid var(--hd-border); +} + +.failoverStep { + display: flex; + align-items: flex-start; + gap: 12px; + padding: 12px 16px; + border-bottom: 1px solid var(--hd-border); +} + +.failoverStep:last-child { + border-bottom: none; +} + +.failoverStepNum { + width: 22px; + height: 22px; + border-radius: 50%; + background: var(--hd-num-badge-bg); + color: #fff; + font-size: 11px; + font-weight: 700; + display: flex; + align-items: center; + justify-content: center; + flex-shrink: 0; + margin-top: 1px; +} + +.failoverStepText { + font-size: 13px; + color: var(--ifm-font-color-base); + line-height: 1.6; +} + +.failoverStepNote { + font-size: 12px; + color: var(--hd-muted); + margin-top: 3px; +} + +.triggerGrid { + display: grid; + grid-template-columns: repeat(3, 1fr); + gap: 12px; + margin-top: 12px; +} + +@media (max-width: 640px) { + .triggerGrid { + grid-template-columns: 1fr; + } +} + +.triggerCard { + padding: 16px; + background: var(--hd-surface); + border: 1px solid var(--hd-border); +} + +.triggerCardTitle { + font-size: 13px; + font-weight: 600; + margin-bottom: 8px; + color: var(--ifm-font-color-base); +} + +/* ── Enable It: commands ───────────────────────────────── */ +.cmdBlock { + display: flex; + align-items: flex-start; + gap: 8px; + background: #0d1117; + padding: 10px 14px; + font-family: var(--ifm-font-family-monospace); + font-size: 13px; + margin-bottom: 8px; + 
line-height: 1.6; + border-left: 3px solid var(--ifm-color-primary); +} + +.cmdPrompt { + color: #6b7280; + flex-shrink: 0; + user-select: none; +} + +.cmdText { + color: #d1d5db; + word-break: break-all; +} + +.cmdOutput { + font-family: var(--ifm-font-family-monospace); + font-size: 12px; + color: var(--hd-green); + padding: 6px 14px 10px; + margin-bottom: 16px; +} + +.enableSectionLabel { + font-size: 12px; + font-weight: 700; + text-transform: uppercase; + letter-spacing: 0.5px; + color: var(--hd-muted); + margin: 20px 0 8px; +} + +.enableNote { + display: flex; + gap: 10px; + background: var(--hd-amber-bg); + border-left: 3px solid var(--hd-amber); + padding: 12px 16px; + margin: 16px 0; + font-size: 13px; + color: var(--ifm-font-color-base); + line-height: 1.6; +} + +.enableNoteIcon { + flex-shrink: 0; + font-size: 16px; +} + +/* ── Quiz ──────────────────────────────────────────────── */ +.quizCard { + background: var(--hd-surface); + border: 1px solid var(--hd-border); + padding: 20px; + margin-bottom: 16px; +} + +.quizQ { + font-size: 15px; + font-weight: 600; + color: var(--ifm-font-color-base); + margin-bottom: 14px; + line-height: 1.5; +} + +.quizOptions { + display: flex; + flex-direction: column; + gap: 8px; +} + +.quizOpt { + text-align: left; + background: var(--hd-surface2); + border: 1px solid var(--hd-border); + border-radius: 0; + padding: 10px 14px; + font-size: 14px; + font-family: var(--ifm-font-family-base); + color: var(--ifm-font-color-base); + cursor: pointer; + transition: border-color 0.15s, background 0.15s; +} + +.quizOpt:hover:not(:disabled) { + border-color: var(--ifm-color-primary); + background: var(--hd-primary-bg); +} + +.quizOpt:disabled { + cursor: default; +} + +.quizOptCorrect { + border-color: var(--hd-green) !important; + background: var(--hd-green-bg) !important; + color: var(--hd-green) !important; + font-weight: 600; +} + +.quizOptWrong { + border-color: var(--hd-red) !important; + background: var(--hd-red-bg) 
!important; + color: var(--hd-red) !important; +} + +.quizFeedback { + margin-top: 12px; + padding: 10px 14px; + font-size: 13px; + line-height: 1.6; + border-left: 3px solid transparent; +} + +.quizFeedbackCorrect { + background: var(--hd-green-bg); + border-left-color: var(--hd-green); + color: var(--ifm-font-color-base); +} + +.quizFeedbackWrong { + background: var(--hd-red-bg); + border-left-color: var(--hd-red); + color: var(--ifm-font-color-base); +} + +.scoreCard { + display: flex; + align-items: center; + gap: 20px; + padding: 20px 24px; + background: var(--hd-surface); + border: 1px solid var(--hd-border); + margin-top: 24px; +} + +.scoreText { + font-size: 18px; + font-weight: 700; + color: var(--ifm-font-color-base); +} + +/* ── Resource links ────────────────────────────────────── */ +.resourceLink { + display: inline-flex; + align-items: center; + gap: 4px; + color: var(--ifm-color-primary); + font-size: 13px; + font-weight: 600; + text-decoration: none; + border-bottom: 1px solid transparent; + transition: border-color 0.15s; +} + +.resourceLink:hover { + border-bottom-color: var(--ifm-color-primary); + text-decoration: none; +} diff --git a/src/components/HADemo/HowItWorks.tsx b/src/components/HADemo/HowItWorks.tsx new file mode 100644 index 0000000000..c99a562f68 --- /dev/null +++ b/src/components/HADemo/HowItWorks.tsx @@ -0,0 +1,227 @@ +import React, { useCallback, useEffect, useRef, useState } from 'react'; +import styles from './HADemo.module.css'; +import { + failoverNodes, + failoverSteps, + FlowMode, + FlowStep, + replicationNodes, + replicationSteps, +} from './replicationSteps'; + +type Props = { onNext: () => void }; + +const AUTO_ADVANCE_MS = 1800; + +function getTimestamp(): string { + const now = new Date(); + return `${String(now.getHours()).padStart(2, '0')}:${String(now.getMinutes()).padStart(2, '0')}:${String(now.getSeconds()).padStart(2, '0')}`; +} + +type LogEntry = { ts: string; step: FlowStep }; + +export default function 
HowItWorks({ onNext }: Props) { + const [mode, setMode] = useState('replication'); + const [stepIdx, setStepIdx] = useState(-1); + const [playing, setPlaying] = useState(false); + const [log, setLog] = useState([]); + const timerRef = useRef | null>(null); + const logRef = useRef(null); + + const steps = mode === 'replication' ? replicationSteps : failoverSteps; + const nodes = mode === 'replication' ? replicationNodes : failoverNodes; + const currentStep = stepIdx >= 0 ? steps[stepIdx] : null; + + const advance = useCallback( + (idx: number) => { + const next = Math.min(steps.length - 1, idx); + setStepIdx(next); + setLog((prev) => [...prev, { ts: getTimestamp(), step: steps[next] }]); + }, + [steps], + ); + + useEffect(() => { + if (playing) { + timerRef.current = setTimeout(() => { + if (stepIdx < steps.length - 1) { + advance(stepIdx + 1); + } else { + setPlaying(false); + } + }, AUTO_ADVANCE_MS); + } + return () => { if (timerRef.current) clearTimeout(timerRef.current); }; + }, [playing, stepIdx, steps.length, advance]); + + useEffect(() => { + if (logRef.current) logRef.current.scrollTop = logRef.current.scrollHeight; + }, [log]); + + function handleModeChange(next: FlowMode) { + setMode(next); + setStepIdx(-1); + setPlaying(false); + setLog([]); + if (timerRef.current) clearTimeout(timerRef.current); + } + + function handleRunDemo() { + if (playing) { + setPlaying(false); + return; + } + if (stepIdx >= steps.length - 1) { + setStepIdx(0); + setLog([{ ts: getTimestamp(), step: steps[0] }]); + setPlaying(true); + } else if (stepIdx === -1) { + advance(0); + setPlaying(true); + } else { + setPlaying(true); + } + } + + function handleReset() { + setPlaying(false); + setStepIdx(-1); + setLog([]); + if (timerRef.current) clearTimeout(timerRef.current); + } + + const packetPct = currentStep?.packetPct ?? 0; + + return ( +
+
+
+
+ +

How Replication and Failover Work

+

+ Step through the lifecycle of an HA Namespace — from normal async replication to an + end-to-end failover. Toggle between the two modes to see how each plays out. +

+ +
+ + +
+ +
+ {/* Flow diagram */} +
+ {nodes.map((node, i) => ( + +
+
{node.title}
+
{node.sub}
+
+ {i < nodes.length - 1 && (() => { + const fillW = + i === 0 + ? packetPct >= 50 ? 100 : packetPct * 2 + : packetPct >= 100 ? 100 : packetPct > 50 ? (packetPct - 50) * 2 : 0; + return ( +
+
+
+
+
+ ); + })()} + + ))} +
+ + {/* Status log */} +
+ {log.length === 0 ? ( +
+ [ready]  + Press "Run Demo" to start the animation +
+ ) : ( + log.map((entry, i) => ( +
+ {entry.ts}  + + {entry.step.log.event}  + + {entry.step.log.msg} +
+ )) + )} +
+ + {/* Controls */} +
+ + +
+ + {/* Timeline */} +
+ {steps.map((step, i) => ( +
+
{i + 1}
+
+
{step.label}
+
{step.detail}
+
+
+ ))} +
+
+ +
+ +
+
+ ); +} diff --git a/src/components/HADemo/Overview.tsx b/src/components/HADemo/Overview.tsx new file mode 100644 index 0000000000..a77389a7b9 --- /dev/null +++ b/src/components/HADemo/Overview.tsx @@ -0,0 +1,164 @@ +import React from 'react'; +import styles from './HADemo.module.css'; + +type Props = { onNext: () => void }; + +export default function Overview({ onNext }: Props) { + return ( +
+
+
+
+ +

What is Temporal High Availability?

+

+ Temporal Cloud High Availability gives an HA-enabled Namespace a continuously replicated standby + that can take over in under 20 minutes, with less than 1 minute of data loss. Depending on the + replication type you choose, it covers the failure of a cell, an entire region, or a whole cloud + provider — on top of the availability-zone failures that every Namespace already tolerates. +

+ +
+

+ The core idea in one sentence +

+

+ Your Namespace asynchronously replicates all Workflow history to a replica in a separate + isolation domain — so if the primary fails, Temporal promotes the replica and your + Workflows keep running with minimal data loss. +

+
+ +

The Problem HA Solves

+
+
+
Without HA
+

Single isolation domain

+

+ Standard Namespaces replicate across three availability zones in one region, handling + AZ-level failures automatically. But a cell-level bug, a regional outage, or a + cloud-provider incident can take your Namespace offline with no automatic recovery path. + Backups run every 4 hours — meaning hours of potential data loss. +

+
+
+
With HA
+

Active primary + synchronized replica

+

+ Every Workflow Execution is asynchronously replicated to a replica in a separate + isolation domain. Temporal Cloud continuously monitors health and automatically fails + over to the replica — restoring service within the RTO window with sub-1-minute data + loss. +

+
+
+ +

What It Looks Like

+
+
+
+ Primary · Active +
+
+
W
+
+
Workflow Executions
+
Writes accepted here
+
+
+
+
3
+
+
3-zone replication
+
Within the isolation domain
+
+
+
+ +
+
+
Async
+
replication
+
+ p95 < 1 min lag +
+
+ +
+
+ Replica · Standby +
+
+
W
+
+
Workflow history mirror
+
Ready to promote
+
+
+
+
3
+
+
Separate isolation domain
+
Different region or cell
+
+
+
+
+ +

Standard vs. High Availability

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
MetricStandard NamespaceHA Namespace
Uptime SLA99.9%99.99%
AZ-level failure✓ Automatic (zero RPO)✓ Automatic (zero RPO)
Cell / regional failure RPOUp to 4 hours (backup)< 1 minute
Cell / regional failure RTOManual intervention20 minutes
Automatic failover✓ Temporal-initiated (default)
Replication lag (p95)N/A< 1 minute
+ +
+ +
+
+ ); +} diff --git a/src/components/HADemo/Quiz.tsx b/src/components/HADemo/Quiz.tsx new file mode 100644 index 0000000000..01917d01bb --- /dev/null +++ b/src/components/HADemo/Quiz.tsx @@ -0,0 +1,168 @@ +import React, { useState } from 'react'; +import styles from './HADemo.module.css'; +import { quizQuestions } from './quizQuestions'; + +type AnswerState = number | null; + +export default function Quiz() { + const [answers, setAnswers] = useState( + () => new Array(quizQuestions.length).fill(null), + ); + const [key, setKey] = useState(0); + + const allAnswered = answers.every((a) => a !== null); + const correctCount = answers.filter((a, i) => a === quizQuestions[i].correct).length; + const pct = Math.round((correctCount / quizQuestions.length) * 100); + + function answer(qi: number, oi: number) { + if (answers[qi] !== null) return; + setAnswers((prev) => { + const next = [...prev]; + next[qi] = oi; + return next; + }); + } + + function reset() { + setAnswers(new Array(quizQuestions.length).fill(null)); + setKey((k) => k + 1); + } + + return ( +
+
+
+
+ +

Test Your Understanding

+ +
+ {quizQuestions.map((q, qi) => { + const given = answers[qi]; + const answered = given !== null; + + return ( +
+
+ Q{qi + 1}. {q.q} +
+
+ {q.options.map((opt, oi) => { + const isCorrect = oi === q.correct; + const isChosen = oi === given; + + let optClass = styles.quizOpt; + if (answered && isChosen && isCorrect) optClass += ` ${styles.quizOptCorrect}`; + else if (answered && isChosen && !isCorrect) optClass += ` ${styles.quizOptWrong}`; + else if (answered && isCorrect) optClass += ` ${styles.quizOptCorrect}`; + + return ( + + ); + })} +
+ + {answered && ( +
+ {given === q.correct ? 'Correct: ' : 'Not quite: '} + {q.explanation} +
+ )} +
+ ); + })} +
+ + {allAnswered && ( +
+
+ {correctCount} / {quizQuestions.length} correct ({pct}%) +
+ +
+ )} + +

Resources

+
+
+
Overview
+

High Availability

+

+ Introduction to Temporal Cloud HA, replication types, SLAs, and target workloads. +

+ + Read the docs → + +
+ +
+
Setup
+

Enable High Availability

+

+ Step-by-step instructions for enabling HA via the Web UI and tcld CLI. +

+ + Enable HA → + +
+ +
+
Operations
+

Configure and Trigger Failovers

+

+ Automatic failover behavior, manual failover triggers, graceful vs. forced modes. +

+ + Failover docs → + +
+ +
+
SLAs
+

RPO and RTO

+

+ Detailed Recovery Point Objective and Recovery Time Objective targets by outage type. +

+ + RPO / RTO → + +
+ +
+
Monitoring
+

Monitor Replication Health

+

+ Replication lag metrics, replica health status, and audit logging for failover events. +

+ + Monitoring docs → + +
+ +
+
Networking
+

HA Connectivity

+

+ DNS configuration, AWS PrivateLink setup, and routing requirements for HA Namespaces. +

+ + Connectivity docs → + +
+
+
+ ); +} diff --git a/src/components/HADemo/ReplicationTypes.tsx b/src/components/HADemo/ReplicationTypes.tsx new file mode 100644 index 0000000000..25d02ed862 --- /dev/null +++ b/src/components/HADemo/ReplicationTypes.tsx @@ -0,0 +1,156 @@ +import React from 'react'; +import styles from './HADemo.module.css'; + +type Props = { onNext: () => void }; + +export default function ReplicationTypes({ onNext }: Props) { + return ( +
+
+
+
+ +

Three Ways to Deploy High Availability

+

+ Temporal Cloud offers three replication models. Each protects against a different failure + scope. Choose based on what level of isolation your workload requires. +

+ + {/* 1: Same-region */} +
+
+
1
+
+
+

+ Same-region Replication + Public Preview +

+

Isolation domains within a single region

+

+ The Primary and Replica Namespaces live in separate isolation domains (cells) + within the same region. Each cell is an independent deployment of the Temporal + service — different databases, different compute, different failure blast radius. +

+

+ This protects against cell-level software bugs or sub-component database failures that + affect all three availability zones in a single cell at once. Because the replica stays in the + same region, Workers don't need multi-region connectivity and latency impact is minimal. +

+
+ Primary cell: aws-us-east-1 (cell A)
+ Replica cell: aws-us-east-1 (cell B)
+ // same region, separate isolation domains +
+
+
+ + {/* 2: Multi-region */} +
+
+
2
+
+
+

+ Multi-region Replication + General Availability +

+

Isolation domains in separate regions of the same cloud

+

+ The Primary and Replica Namespaces run in different geographic regions of + the same cloud provider (e.g. AWS us-east-1 and AWS us-west-2). This protects against + everything same-region covers, plus full region-wide incidents. +

+

+ During failover, the Namespace CNAME record updates to the new active region. Workers + already deployed in the replica region continue processing without added cross-region latency. + Workers that run only in the original primary region will experience cross-region latency until + they are relocated or additional Workers are deployed in the new active region. +

+
+ Primary: aws-us-east-1 → ha-namespace.acct.tmprl.cloud
+ After failover: aws-us-west-2 → ha-namespace.acct.tmprl.cloud
+ // endpoint unchanged; CNAME target updates (15 s TTL) +
+
+
+ + {/* 3: Multi-cloud */} +
+
+
3
+
+
+

+ Multi-cloud Replication + Public Preview +

+

Isolation domains across different cloud providers

+

+ The Primary and Replica run on entirely different cloud providers (e.g. + AWS and GCP). Traffic automatically shifts to the replica if the entire primary cloud + provider experiences a widespread outage. This is the broadest failure protection + available. +

+

+ Multi-cloud replication carries the highest operational complexity, including managing + Worker fleets on two cloud providers and routing inter-cloud network traffic. Use this + when contractual obligations require cloud-provider-level redundancy. +

+
+ Primary: AWS us-east-1
+ Replica: GCP us-central1
+ // independent cloud providers, maximum blast-radius isolation +
+
+
+ +

Failure Coverage by Replication Type

+
+
Failure type
+
Same-region
+
Multi-region
+
Multi-cloud
+ +
Single AZ failure
+
(also standard)
+
+
+ +
Cell / sub-component failure
+
+
+
+ +
Regional outage
+
+
+
+ +
Cloud-provider outage
+
+
+
+
+ +
+
Worker deployment tip
+

+ For multi-region and multi-cloud replication, Temporal strongly recommends deploying + Workers in both the primary and replica regions. Workers that run only in the original primary + region will experience cross-region latency after failover until Workers are deployed in the new + active region; the Namespace endpoint itself does not change, so they do not need to be + repointed. Workers already running in the replica region pick up tasks immediately. +

+
+ +
+ +
+
+ ); +} diff --git a/src/components/HADemo/index.tsx b/src/components/HADemo/index.tsx new file mode 100644 index 0000000000..fa07be6631 --- /dev/null +++ b/src/components/HADemo/index.tsx @@ -0,0 +1,53 @@ +import React, { useState } from 'react'; +import EnableIt from './EnableIt'; +import FailoverSection from './FailoverSection'; +import HowItWorks from './HowItWorks'; +import styles from './HADemo.module.css'; +import Overview from './Overview'; +import Quiz from './Quiz'; +import ReplicationTypes from './ReplicationTypes'; + +type SectionId = 'overview' | 'replication' | 'howitworks' | 'failover' | 'enableit' | 'quiz'; + +const NAV: { id: SectionId; label: string }[] = [ + { id: 'overview', label: 'What is HA?' }, + { id: 'replication', label: 'Replication Types' }, + { id: 'howitworks', label: 'How It Works' }, + { id: 'failover', label: 'Failover' }, + { id: 'enableit', label: 'Enable It' }, + { id: 'quiz', label: 'Test Yourself' }, +]; + +export default function HADemo() { + const [active, setActive] = useState('overview'); + + function next(current: SectionId) { + const idx = NAV.findIndex((n) => n.id === current); + if (idx < NAV.length - 1) setActive(NAV[idx + 1].id); + } + + return ( +
+ + + {active === 'overview' && next('overview')} />} + {active === 'replication' && next('replication')} />} + {active === 'howitworks' && next('howitworks')} />} + {active === 'failover' && next('failover')} />} + {active === 'enableit' && next('enableit')} />} + {active === 'quiz' && } +
+ ); +} diff --git a/src/components/HADemo/quizQuestions.ts b/src/components/HADemo/quizQuestions.ts new file mode 100644 index 0000000000..520023dd18 --- /dev/null +++ b/src/components/HADemo/quizQuestions.ts @@ -0,0 +1,66 @@ +export type QuizQuestion = { + q: string; + options: readonly string[]; + correct: number; + explanation: string; +}; + +export const quizQuestions: QuizQuestion[] = [ + { + q: 'What does RPO (Recovery Point Objective) measure?', + options: [ + 'How fast a system recovers after an outage', + 'The maximum acceptable amount of data loss measured in time', + 'The number of replicas required for HA', + 'The replication throughput in events per second', + ], + correct: 1, + explanation: + 'RPO measures how much data could be lost — "how far back in time could we roll back?" Temporal Cloud HA targets a sub-1-minute RPO, meaning at most ~1 minute of Workflow history could be lost during a regional failover.', + }, + { + q: "What contractual SLA does Temporal Cloud's High Availability tier provide?", + options: ['99.9%', '99.95%', '99.99%', '100%'], + correct: 2, + explanation: + 'HA Namespaces carry a 99.99% uptime SLA. Standard Namespaces receive 99.9%. The extra nine is achieved through a synchronized replica that can take over within the RTO window.', + }, + { + q: 'What is the RTO (Recovery Time Objective) for a High Availability Namespace?', + options: ['Instant (zero)', 'Under 1 minute', '20 minutes', '1 hour'], + correct: 2, + explanation: + 'Temporal Cloud HA targets a 20-minute RTO — the maximum time to restore service after a cell or regional failure. 
The RPO is sub-1-minute (data loss), while RTO covers the recovery duration.', + }, + { + q: 'Which failover mode does Temporal Cloud use by default?', + options: [ + 'Graceful (always waits for replication to drain)', + 'Forced (immediately promotes replica)', + 'Hybrid (tries graceful for 10 s, then forces)', + 'Manual only (no automatic failover)', + ], + correct: 2, + explanation: + 'Hybrid is the default. It first attempts a graceful handover (up to 10 seconds) to drain in-flight replication tasks, then automatically switches to a forced failover if the graceful window expires — balancing consistency and availability.', + }, + { + q: 'What outage type does same-region replication protect against, but standard 3-zone replication does NOT?', + options: [ + 'Individual availability zone failures', + 'Cell or sub-component failures within a region', + 'Complete AWS or GCP cloud-provider outages', + 'Network packet loss between workers', + ], + correct: 1, + explanation: + 'Standard Namespaces replicate across three availability zones and handle AZ failures. Same-region HA adds a second isolation domain (cell), protecting against cell-level software bugs or database failures that survive across all three zones.', + }, + { + q: 'After removing a replica from a High Availability Namespace, how long must you wait before re-enabling HA in the same region?', + options: ['1 hour', '24 hours', '3 days', '7 days'], + correct: 3, + explanation: + 'Temporal enforces a 7-day cooldown before you can re-enable replication in the same location after removing it. 
This prevents rapid cycling that could create inconsistent state during the replica rebuild period.', + }, +]; diff --git a/src/components/HADemo/replicationSteps.ts b/src/components/HADemo/replicationSteps.ts new file mode 100644 index 0000000000..bc5e7c0712 --- /dev/null +++ b/src/components/HADemo/replicationSteps.ts @@ -0,0 +1,128 @@ +export type FlowMode = 'replication' | 'failover'; +export type LogLevel = 'info' | 'success' | 'warn'; + +export type FlowStep = { + label: string; + detail: string; + /** 0 = left node, 1 = middle node, 2 = right node */ + activeNode: 0 | 1 | 2; + /** Progress along the tracks: 0–100 */ + packetPct: number; + log: { level: LogLevel; event: string; msg: string }; +}; + +export type NodeDef = { title: string; sub: string }; + +export const replicationNodes: NodeDef[] = [ + { title: 'Primary Namespace', sub: 'Active · writes accepted' }, + { title: 'Replication Layer', sub: 'Async · background' }, + { title: 'Replica Namespace', sub: 'Standby · read-only' }, +]; + +export const failoverNodes: NodeDef[] = [ + { title: 'Primary Namespace', sub: 'Failing · degraded' }, + { title: 'HA Engine', sub: 'Detection · coordination' }, + { title: 'Replica Namespace', sub: 'Promoting → Active' }, +]; + +export const replicationSteps: FlowStep[] = [ + { + label: 'Workflow event written to Primary', + detail: + 'A Workflow Execution event (e.g. WorkflowExecutionStarted) is durably recorded in the Primary Namespace and acknowledged to the client. Replication happens independently in the background.', + activeNode: 0, + packetPct: 0, + log: { level: 'info', event: 'WRITE', msg: 'Event persisted in Primary: WorkflowExecutionStarted' }, + }, + { + label: 'Replication task queued', + detail: + 'The replication engine queues an asynchronous task to copy the new History Event to the Replica. 
Client latency is unaffected — the client already received its acknowledgment.', + activeNode: 0, + packetPct: 20, + log: { level: 'info', event: 'QUEUE', msg: 'Replication task enqueued for Replica Namespace' }, + }, + { + label: 'Event transmitted to Replica', + detail: + 'The Replication Layer transmits the History Event and its version stamp to the Replica Namespace. Temporal Cloud targets a P95 replication lag of under 1 minute.', + activeNode: 1, + packetPct: 50, + log: { level: 'info', event: 'TRANSMIT', msg: 'Replication in-flight → Replica (p95 < 1 min lag)' }, + }, + { + label: 'Replica persists the event', + detail: + 'The Replica Namespace persists the replicated event across its own availability zones and stores the version history entry used for conflict resolution during failover.', + activeNode: 2, + packetPct: 100, + log: { level: 'success', event: 'PERSIST', msg: 'Replica wrote event with version stamp' }, + }, + { + label: 'Replica ready for failover', + detail: + 'The Replica is synchronized and healthy. All Visibility APIs remain queryable against both Primary and Replica. The system is ready to fail over with sub-1-minute data loss if needed.', + activeNode: 2, + packetPct: 100, + log: { level: 'success', event: 'READY', msg: 'Replica health OK — RPO target achievable' }, + }, +]; + +export const failoverSteps: FlowStep[] = [ + { + label: 'Health check detects degradation', + detail: + 'Temporal Cloud monitors the Primary Namespace continuously. 
When error rates, latencies, or infrastructure checks exceed failure thresholds, the HA Engine triggers a failover.', + activeNode: 0, + packetPct: 0, + log: { level: 'warn', event: 'HEALTH', msg: 'Primary health check failed — thresholds exceeded' }, + }, + { + label: 'Hybrid failover initiated (10 s window)', + detail: + 'The default Hybrid mode first attempts a graceful handover: the Primary stops accepting new writes and waits up to 10 seconds for in-flight replication tasks to drain to the Replica.', + activeNode: 1, + packetPct: 50, + log: { level: 'warn', event: 'FAILOVER', msg: 'Hybrid: graceful handover attempted (10 s timeout)' }, + }, + { + label: 'Replica promoted to active', + detail: + 'After the graceful window (or immediately in forced mode), the Replica Namespace is promoted to active. It begins accepting Workflow traffic. The old Primary enters standby mode.', + activeNode: 2, + packetPct: 100, + log: { level: 'success', event: 'PROMOTE', msg: 'Replica promoted — now accepting writes' }, + }, + { + label: 'DNS CNAME updated', + detail: + 'The Namespace CNAME record is updated to point to the new active region (15 s TTL). Clients and Workers begin resolving to the new active endpoint within approximately 30 seconds.', + activeNode: 2, + packetPct: 100, + log: { level: 'info', event: 'DNS', msg: 'CNAME updated → new region (15 s TTL, ~30 s convergence)' }, + }, + { + label: 'Conflict resolution applied', + detail: + 'Any events that were in-flight and not yet replicated undergo conflict resolution. The history branch with the highest Namespace version is authoritative. Signals are re-injected; some Activity progress may roll back.', + activeNode: 1, + packetPct: 50, + log: { level: 'info', event: 'RESOLVE', msg: 'Conflict resolution: highest-version branch wins' }, + }, + { + label: 'Old Primary demoted to Replica', + detail: + 'The original Primary becomes the new standby Replica. 
Outstanding Activities will time out and be retried on the new active Namespace — handle these like any Worker restart.', + activeNode: 0, + packetPct: 0, + log: { level: 'info', event: 'DEMOTE', msg: 'Original Primary is now Replica (standby)' }, + }, + { + label: 'Failover complete', + detail: + 'Traffic is fully shifted. Workflow Executions continue from their last replicated state. Your Namespace endpoint URL and credentials are unchanged — no client updates required.', + activeNode: 2, + packetPct: 100, + log: { level: 'success', event: 'COMPLETE', msg: 'Failover complete — RTO target: 20 min' }, + }, +];