import * as wf from '@temporalio/workflow';
import type * as activities from './activities';
import * as _3rdPartyAsyncMutexLibrary from 'async-mutex';
import {
  AssignNodesToJobUpdateInput,
  ClusterManagerState,
  ClusterManagerStateSummary,
  DeleteJobUpdateInput,
} from './types';

const { assignNodesToJob, unassignNodesForJob } = wf.proxyActivities<typeof activities>({
  startToCloseTimeout: '1 minute',
});

const { findBadNodes } = wf.proxyActivities<typeof activities>({
  startToCloseTimeout: '1 minute',
  retry: {
    // This activity is called with the nodesMutex held. We do not retry, since retries would block
    // cluster operations.
    maximumAttempts: 1,
  },
});

// ClusterManagerWorkflow keeps track of the job assignments of a cluster of nodes. It exposes an
// API to start and shut down the cluster, to assign nodes to jobs, and to delete jobs. The
// workflow maps this API to signals and updates. Operations altering node assignments must not
// interleave (they must be serialized), and a standard (non-Temporal-specific) async mutex from a
// third-party library is used to ensure this.
export class ClusterManager {
  state: ClusterManagerState;
  jobsWithNodesAssigned: Set<string>;
  nodesMutex: _3rdPartyAsyncMutexLibrary.Mutex;

  constructor(state?: ClusterManagerState) {
    this.state = state ?? {
      clusterStarted: false,
      clusterShutdown: false,
      nodes: new Map<string, string | null>(),
      maxAssignedNodes: 0,
    };
    this.jobsWithNodesAssigned = new Set<string>();
    this.nodesMutex = new _3rdPartyAsyncMutexLibrary.Mutex();
  }

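  // Marks the cluster as started and registers 25 nodes, all initially unassigned (value null).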
  startCluster(): void {
    this.state.clusterStarted = true;
    for (let i = 0; i < 25; i++) {
      this.state.nodes.set(i.toString(), null);
    }
    wf.log.info('Cluster started');
  }

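  // Waits until the cluster has started, then marks it shut down; the handlers below check this
  // flag and reject further mutations.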
  async shutDownCluster(): Promise<void> {
    await wf.condition(() => this.state.clusterStarted);
    this.state.clusterShutdown = true;
    wf.log.info('Cluster shutdown');
  }

  async assignNodesToJob(input: AssignNodesToJobUpdateInput): Promise<ClusterManagerStateSummary> {
    await wf.condition(() => this.state.clusterStarted);
    if (this.state.clusterShutdown) {
      // If you want the client to receive a failure, either add an update validator and throw the
      // exception from there, or throw an ApplicationFailure. Other exceptions in the handler fail
      // the workflow task, which retries indefinitely and leaves the update stuck.
      throw new wf.ApplicationFailure('Cannot assign nodes to a job: Cluster is already shut down');
    }
    return await this.nodesMutex.runExclusive(async (): Promise<ClusterManagerStateSummary> => {
      // Idempotency guard: do nothing if the job already has nodes assigned.
      if (!new Set(this.state.nodes.values()).has(input.jobName)) {
        const unassignedNodes = this.getUnassignedNodes();
        if (input.numNodes > unassignedNodes.size) {
          throw new wf.ApplicationFailure(
            `Cannot assign ${input.numNodes} nodes; have only ${unassignedNodes.size} available`
          );
        }
        const nodesToAssign = Array.from(unassignedNodes).slice(0, input.numNodes);
        // This await would be dangerous without the lock held because it would allow interleaving
        // with the deleteJob and performHealthChecks operations, both of which mutate
        // this.state.nodes.
        await assignNodesToJob({ nodes: nodesToAssign, jobName: input.jobName });
        for (const node of nodesToAssign) {
          this.state.nodes.set(node, input.jobName);
        }
        this.state.maxAssignedNodes = Math.max(this.state.maxAssignedNodes, this.getAssignedNodes().size);
      }
      return this.getStateSummary();
    });
  }

  async deleteJob(input: DeleteJobUpdateInput) {
    await wf.condition(() => this.state.clusterStarted);
    if (this.state.clusterShutdown) {
      // If you want the client to receive a failure, either add an update validator and throw the
      // exception from there, or throw an ApplicationFailure. Other exceptions in the handler fail
      // the workflow task, which retries indefinitely and leaves the update stuck.
      throw new wf.ApplicationFailure('Cannot delete job: Cluster is already shut down');
    }
    await this.nodesMutex.runExclusive(async () => {
      const nodesToUnassign = Array.from(this.state.nodes.entries())
        .filter(([_, v]) => v === input.jobName)
        .map(([k, _]) => k);
      // This await would be dangerous without the lock held because it would allow interleaving
      // with the assignNodesToJob and performHealthChecks operations, both of which mutate
      // this.state.nodes.
      await unassignNodesForJob({ nodes: nodesToUnassign, jobName: input.jobName });
      for (const node of nodesToUnassign) {
        this.state.nodes.set(node, null);
      }
    });
  }

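  // Marks any nodes reported by the findBadNodes activity as 'BAD!'. Runs under the mutex so the
  // health check cannot interleave with job assignment or deletion.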
  async performHealthChecks(): Promise<void> {
    wf.log.info('performHealthChecks');
    await this.nodesMutex.runExclusive(async () => {
      const badNodes = await findBadNodes({ nodesToCheck: Array.from(this.getAssignedNodes()) });
      for (const node of badNodes) {
        this.state.nodes.set(node, 'BAD!');
      }
    });
  }

  getState(): ClusterManagerState {
    return {
      clusterStarted: this.state.clusterStarted,
      clusterShutdown: this.state.clusterShutdown,
      nodes: this.state.nodes,
      maxAssignedNodes: this.state.maxAssignedNodes,
    };
  }

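  // Compact summary of the cluster; assignNodesToJob returns this to the caller of the update.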
  getStateSummary(): ClusterManagerStateSummary {
    return {
      maxAssignedNodes: this.state.maxAssignedNodes,
      assignedNodes: this.getAssignedNodes().size,
      badNodes: this.getBadNodes().size,
    };
  }

  getUnassignedNodes(): Set<string> {
    return new Set(Array.from(this.state.nodes.keys()).filter((key) => this.state.nodes.get(key) === null));
  }

  getBadNodes(): Set<string> {
    return new Set(Array.from(this.state.nodes.keys()).filter((key) => this.state.nodes.get(key) === 'BAD!'));
  }

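  // With a jobName, returns the nodes assigned to that job; without one, returns every node that
  // is assigned to some job and not marked bad.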
  getAssignedNodes(jobName?: string): Set<string> {
    return new Set(
      Array.from(this.state.nodes.keys()).filter((key) => {
        const value = this.state.nodes.get(key);
        return jobName ? value === jobName : value !== null && value !== 'BAD!';
      })
    );
  }
}
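
// ---------------------------------------------------------------------------------------------
// Minimal sketch (not part of the class above): one way the ClusterManager could be wired into a
// workflow via signals and updates, as the class comment describes. The definition names and the
// workflow function below are hypothetical illustrations, not this sample's actual wiring.
// ---------------------------------------------------------------------------------------------
export const startClusterSignal = wf.defineSignal('startCluster');
export const shutdownClusterSignal = wf.defineSignal('shutdownCluster');
export const assignNodesToJobUpdate = wf.defineUpdate<ClusterManagerStateSummary, [AssignNodesToJobUpdateInput]>(
  'assignNodesToJob'
);
export const deleteJobUpdate = wf.defineUpdate<void, [DeleteJobUpdateInput]>('deleteJob');

export async function clusterManagerWorkflow(state?: ClusterManagerState): Promise<ClusterManagerState> {
  const manager = new ClusterManager(state);
  // Map the ClusterManager API onto message handlers.
  wf.setHandler(startClusterSignal, () => manager.startCluster());
  wf.setHandler(shutdownClusterSignal, () => manager.shutDownCluster());
  wf.setHandler(assignNodesToJobUpdate, (input) => manager.assignNodesToJob(input));
  wf.setHandler(deleteJobUpdate, (input) => manager.deleteJob(input));

  // Run periodic health checks until the cluster is shut down.
  while (!manager.state.clusterShutdown) {
    if (manager.state.clusterStarted) {
      await manager.performHealthChecks();
    }
    await wf.sleep('10s');
  }
  return manager.getState();
}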