|
| 1 | +import * as wf from '@temporalio/workflow'; |
| 2 | +import type * as activities from './activities'; |
| 3 | +import { Mutex } from 'async-mutex'; |
| 4 | +import { |
| 5 | + AssignNodesToJobUpdateInput, |
| 6 | + ClusterManagerState, |
| 7 | + ClusterManagerStateSummary, |
| 8 | + DeleteJobUpdateInput, |
| 9 | +} from './types'; |
| 10 | + |
// Activity stubs used by the workflow code below. Every call made through these proxies is
// scheduled with a start-to-close timeout of one minute.
const { startCluster, assignNodesToJob, unassignNodesForJob } = wf.proxyActivities<typeof activities>({
  startToCloseTimeout: '1 minute',
});
| 14 | + |
// ClusterManagerWorkflow keeps track of the job assignments of a cluster of nodes. It exposes an
// API to start and shut down the cluster, to assign jobs to nodes, to delete jobs, and to query
// cluster status. The workflow maps this API to Signals, Updates, and Queries. The assign and
// delete operations issue an RPC changing the state of the remote cluster, and then mutate workflow
// state reflecting the change made. In order that workflow state remains in sync with the true
// cluster state, assign/delete operations must not be performed concurrently (i.e. they must not
// "interleave" with each other; they must be "serialized"; they must be "atomic"). An async mutex
// from a 3rd party library is used to ensure this.
export class ClusterManager {
  /** Cluster lifecycle flags, the node -> job assignment map, and the assigned-node high-water mark. */
  state: ClusterManagerState;
  /**
   * Names of jobs that have already been given nodes by this instance. It is not part of `state`,
   * so it always starts out empty, even when a previous state is passed to the constructor.
   */
  seenJobs: Set<string>;
  /** Serializes the assign and delete operations so they never interleave. */
  nodesMutex: Mutex;

  /**
   * @param state Optional existing state to continue from. When omitted, the manager starts with
   *   a cluster that is neither started nor shut down and has no nodes.
   */
  constructor(state?: ClusterManagerState) {
    this.state = state ?? {
      clusterStarted: false,
      clusterShutdown: false,
      nodes: new Map<string, string | null>(),
      maxAssignedNodes: 0,
    };
    this.nodesMutex = new Mutex();
    this.seenJobs = new Set<string>();
  }

  /**
   * Runs the `startCluster` activity, marks the cluster as started and registers 25 unassigned
   * nodes keyed "0" through "24".
   */
  async startCluster(): Promise<void> {
    await startCluster();
    this.state.clusterStarted = true;
    for (let i = 0; i < 25; i++) {
      this.state.nodes.set(i.toString(), null);
    }
    wf.log.info('Cluster started');
  }

  /** Waits until the cluster has been started, then marks it as shut down. */
  async shutDownCluster(): Promise<void> {
    await wf.condition(() => this.state.clusterStarted);
    this.state.clusterShutdown = true;
    wf.log.info('Cluster shutdown');
  }

  /**
   * Assigns `input.numNodes` currently unassigned nodes to the job `input.jobName`.
   *
   * The operation is idempotent per job name: if the job has been seen before, nothing is changed
   * and the current summary is returned.
   *
   * @param input Job name and the number of nodes requested.
   * @returns Summary of the node assignment state after the operation.
   * @throws wf.ApplicationFailure if the cluster is already shut down, if `numNodes` is not a
   *   non-negative integer, or if fewer than `numNodes` nodes are unassigned.
   */
  async assignNodesToJob(input: AssignNodesToJobUpdateInput): Promise<ClusterManagerStateSummary> {
    await wf.condition(() => this.state.clusterStarted);
    if (this.state.clusterShutdown) {
      // If you want the client to receive a failure, either add an update validator and throw the
      // exception from there, or throw an ApplicationFailure. Other exceptions in the handler will
      // cause the workflow to keep retrying and get it stuck.
      throw new wf.ApplicationFailure('Cannot assign nodes to a job: Cluster is already shut down');
    }
    return await this.nodesMutex.runExclusive(async (): Promise<ClusterManagerStateSummary> => {
      // Idempotency guard: do nothing if the job already has nodes assigned.
      if (!this.seenJobs.has(input.jobName)) {
        // A negative count would pass the capacity check below and then make `slice(0, n)` select
        // all but the last |n| free nodes; fractional/NaN counts would be silently truncated.
        if (!Number.isInteger(input.numNodes) || input.numNodes < 0) {
          throw new wf.ApplicationFailure(
            `Cannot assign ${input.numNodes} nodes; the number of nodes must be a non-negative integer`
          );
        }
        const unassignedNodes = this.getUnassignedNodes();
        if (input.numNodes > unassignedNodes.size) {
          throw new wf.ApplicationFailure(
            `Cannot assign ${input.numNodes} nodes; have only ${unassignedNodes.size} available`
          );
        }
        const nodesToAssign = Array.from(unassignedNodes).slice(0, input.numNodes);
        // This await would be dangerous without the lock held because it would allow interleaving
        // with the deleteJob operation, which mutates this.state.nodes.
        await assignNodesToJob({ nodes: nodesToAssign, jobName: input.jobName });
        for (const node of nodesToAssign) {
          this.state.nodes.set(node, input.jobName);
        }
        this.seenJobs.add(input.jobName);
        this.state.maxAssignedNodes = Math.max(this.state.maxAssignedNodes, this.getAssignedNodes().size);
      }
      return this.getStateSummary();
    });
  }

  /**
   * Unassigns every node currently assigned to the job `input.jobName`.
   *
   * @param input Name of the job whose nodes should be released.
   * @throws wf.ApplicationFailure if the cluster is already shut down.
   */
  async deleteJob(input: DeleteJobUpdateInput): Promise<void> {
    await wf.condition(() => this.state.clusterStarted);
    if (this.state.clusterShutdown) {
      // If you want the client to receive a failure, either add an update validator and throw the
      // exception from there, or throw an ApplicationFailure. Other exceptions in the handler will
      // cause the workflow to keep retrying and get it stuck.
      throw new wf.ApplicationFailure('Cannot delete job: Cluster is already shut down');
    }
    await this.nodesMutex.runExclusive(async () => {
      const nodesToUnassign = Array.from(this.state.nodes.entries())
        .filter(([, assignedJob]) => assignedJob === input.jobName)
        .map(([node]) => node);
      // This await would be dangerous without the lock held because it would allow interleaving
      // with the assignNodesToJob operation, which mutates this.state.nodes.
      await unassignNodesForJob({ nodes: nodesToUnassign, jobName: input.jobName });
      for (const node of nodesToUnassign) {
        this.state.nodes.set(node, null);
      }
    });
  }

  /**
   * Returns the current state as a new object. Note that `nodes` is the live Map held by this
   * instance, not a copy.
   */
  getState(): ClusterManagerState {
    return {
      clusterStarted: this.state.clusterStarted,
      clusterShutdown: this.state.clusterShutdown,
      nodes: this.state.nodes,
      maxAssignedNodes: this.state.maxAssignedNodes,
    };
  }

  /** Returns the assigned-node high-water mark together with the current number of assigned nodes. */
  getStateSummary(): ClusterManagerStateSummary {
    return {
      maxAssignedNodes: this.state.maxAssignedNodes,
      assignedNodes: this.getAssignedNodes().size,
    };
  }

  /** Returns the ids of all nodes whose assignment is `null`. */
  getUnassignedNodes(): Set<string> {
    return new Set(Array.from(this.state.nodes.keys()).filter((key) => this.state.nodes.get(key) === null));
  }

  /**
   * Returns the ids of the nodes assigned to `jobName`, or, when `jobName` is omitted, of all
   * nodes assigned to any job.
   *
   * NOTE(review): in the "any job" case nodes whose value is the string 'BAD!' are excluded.
   * Nothing in this class writes that value; presumably it is a marker set elsewhere -- confirm.
   */
  getAssignedNodes(jobName?: string): Set<string> {
    return new Set(
      Array.from(this.state.nodes.keys()).filter((key) => {
        const value = this.state.nodes.get(key);
        if (jobName === undefined) {
          return value !== null && value !== 'BAD!';
        }
        return value === jobName;
      })
    );
  }
}
0 commit comments