From 5d0307d47a4ef6123aef0f3fe04d49b9c0b44f4c Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Tue, 18 Jun 2024 18:05:03 -0700 Subject: [PATCH 01/28] Atomic message handlers sample --- .../atomic_message_handlers.py | 195 ++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 update_and_signal_handlers/atomic_message_handlers.py diff --git a/update_and_signal_handlers/atomic_message_handlers.py b/update_and_signal_handlers/atomic_message_handlers.py new file mode 100644 index 00000000..6cb78953 --- /dev/null +++ b/update_and_signal_handlers/atomic_message_handlers.py @@ -0,0 +1,195 @@ +import asyncio +from datetime import timedelta +import logging +from typing import Dict, List, Optional + +from temporalio import activity, common, workflow +from temporalio.client import Client, WorkflowHandle +from temporalio.worker import Worker + +# This samples shows off the key concurrent programming primitives for Workflows, especially +# useful for workflow that receive signals and updates. + +# - Making signal and update handlers only operate when the workflow is within a certain state +# (here between cluster_started and cluster_shutdown) using workflow.wait_condition. +# - Signal and update handlers can block and their actions can be interleaved with one another and with the main workflow. +# Here, we use a lock to protect shared state from interleaved access. +# - Running start_workflow with an initializer signal that you want to run before anything else. +# +@activity.defn +async def allocate_nodes_to_job(nodes: List[int], task_name: str) -> List[int]: + print(f"Assigning nodes {nodes} to job {task_name}") + await asyncio.sleep(0.1) + +@activity.defn +async def deallocate_nodes_for_job(nodes: List[int], task_name: str) -> List[int]: + print(f"Deallocating nodes {nodes} from job {task_name}") + await asyncio.sleep(0.1) + +@activity.defn +async def find_bad_nodes(nodes: List[int]) -> List[int]: + await asyncio.sleep(0.1) + bad_nodes = [n for n in nodes if n % 5 == 0] + if bad_nodes: + print(f"Found bad nodes: {bad_nodes}") + return bad_nodes + +# ClusterManager keeps track of the allocations of a cluster of nodes. +# Via signals, the cluster can be started and shutdown. +# Via updates, clients can also assign jobs to nodes, resize jobs, and delete jobs. +# These updates must run atomically. 
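The comments above promise that these updates run atomically; the reason a lock is needed at all is that every `await` inside a handler yields control to the event loop, so two handlers can both read the free-node list before either one records an assignment. Below is a minimal, self-contained sketch of that hazard in plain asyncio (hypothetical names, not part of the sample; workflow handlers interleave in the same way):

```python
import asyncio

nodes: dict[int, str | None] = {n: None for n in range(4)}


async def allocate(task: str, want: int, lock: asyncio.Lock | None) -> list[int]:
    if lock:
        await lock.acquire()
    try:
        free = [k for k, v in nodes.items() if v is None]
        # Stands in for an awaited activity: control is yielded here.
        await asyncio.sleep(0.1)
        chosen = free[:want]
        for k in chosen:
            nodes[k] = task
        return chosen
    finally:
        if lock:
            lock.release()


async def main() -> None:
    # Without a lock, both callers read the same free list and hand out the same nodes.
    print(await asyncio.gather(allocate("a", 2, None), allocate("b", 2, None)))
    # With a shared lock, the second caller sees the first caller's assignments.
    nodes.update({k: None for k in nodes})
    shared = asyncio.Lock()
    print(await asyncio.gather(allocate("a", 2, shared), allocate("b", 2, shared)))


asyncio.run(main())
```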
+@workflow.defn +class ClusterManager: + def __init__(self) -> None: + self.cluster_started = False + self.cluster_shutdown = False + # Protects workflow state from interleaved access + self.nodes_lock = asyncio.Lock() + + @workflow.signal + async def start_cluster(self): + self.cluster_started = True + self.nodes : Dict[Optional[str]] = dict([(k, None) for k in range(25)]) + workflow.logger.info("Cluster started") + + @workflow.signal + async def shutdown_cluster(self): + await workflow.wait_condition(lambda: self.cluster_started) + self.cluster_shutdown = True + workflow.logger.info("Cluster shut down") + + @workflow.update + async def allocate_n_nodes_to_job(self, task_name: str, num_nodes: int, ) -> List[int]: + await workflow.wait_condition(lambda: self.cluster_started) + assert not self.cluster_shutdown + + await self.nodes_lock.acquire() + try: + unassigned_nodes = [k for k, v in self.nodes.items() if v is None] + if len(unassigned_nodes) < num_nodes: + raise ValueError(f"Cannot allocate {num_nodes} nodes; have only {len(unassigned_nodes)} available") + assigned_nodes = unassigned_nodes[:num_nodes] + # This await would be dangerous without nodes_lock because it yields control and allows interleaving. + await self._allocate_nodes_to_job(assigned_nodes, task_name) + return assigned_nodes + finally: + self.nodes_lock.release() + + + async def _allocate_nodes_to_job(self, assigned_nodes: List[int], task_name: str) -> List[int]: + await workflow.execute_activity( + allocate_nodes_to_job, args=[assigned_nodes, task_name], start_to_close_timeout=timedelta(seconds=10) + ) + for node in assigned_nodes: + self.nodes[node] = task_name + + @workflow.update + async def delete_job(self, task_name: str) -> str: + await workflow.wait_condition(lambda: self.cluster_started) + assert not self.cluster_shutdown + await self.nodes_lock.acquire() + try: + nodes_to_free = [k for k, v in self.nodes.items() if v == task_name] + # This await would be dangerous without nodes_lock because it yields control and allows interleaving. 
+ await self._deallocate_nodes_for_job(nodes_to_free, task_name) + return "Done" + finally: + self.nodes_lock.release() + + async def _deallocate_nodes_for_job(self, nodes_to_free: List[int], task_name: str) -> List[int]: + await workflow.execute_activity( + deallocate_nodes_for_job, args=[nodes_to_free, task_name], start_to_close_timeout=timedelta(seconds=10) + ) + for node in nodes_to_free: + self.nodes[node] = None + + @workflow.update + async def resize_job(self, task_name: str, new_size: int) -> List[int]: + await workflow.wait_condition(lambda: self.cluster_started) + assert not self.cluster_shutdown + await self.nodes_lock.acquire() + try: + allocated_nodes = [k for k, v in self.nodes.items() if v == task_name] + delta = new_size - len(allocated_nodes) + if delta == 0: + return allocated_nodes + elif delta > 0: + unassigned_nodes = [k for k, v in self.nodes.items() if v is None] + if len(unassigned_nodes) < delta: + raise ValueError(f"Cannot allocate {delta} nodes; have only {len(unassigned_nodes)} available") + nodes_to_assign = unassigned_nodes[:delta] + await self._allocate_nodes_to_job(nodes_to_assign, task_name) + return allocated_nodes + nodes_to_assign + else: + nodes_to_deallocate = allocated_nodes[delta:] + await self._deallocate_nodes_for_job(nodes_to_deallocate, task_name) + return list(filter(lambda x: x not in nodes_to_deallocate, allocated_nodes)) + finally: + self.nodes_lock.release() + + async def perform_health_checks(self): + await self.nodes_lock.acquire() + try: + assigned_nodes = [k for k, v in self.nodes.items() if v is not None] + bad_nodes = await workflow.execute_activity(find_bad_nodes, assigned_nodes, start_to_close_timeout=timedelta(seconds=10)) + for node in bad_nodes: + self.nodes[node] = "BAD!" + finally: + self.nodes_lock.release() + + @workflow.run + async def run(self): + await workflow.wait_condition(lambda: self.cluster_started) + + # Perform health checks at intervals + while True: + try: + await workflow.wait_condition(lambda: self.cluster_shutdown, timeout=timedelta(seconds=1)) + except asyncio.TimeoutError: + pass + await self.perform_health_checks() + + # Now we can start allocating jobs to nodes + await workflow.wait_condition(lambda: self.cluster_shutdown) + +async def do_cluster_lifecycle(wf: WorkflowHandle): + allocation_updates = [] + for i in range(6): + allocation_updates.append(wf.execute_update(ClusterManager.allocate_n_nodes_to_job, args=[f"task-{i}", 2])) + await asyncio.gather(*allocation_updates) + + resize_updates = [] + for i in range(6): + resize_updates.append(wf.execute_update(ClusterManager.resize_job, args=[f"task-{i}", 4])) + await asyncio.gather(*resize_updates) + + deletion_updates = [] + for i in range(6): + deletion_updates.append(wf.execute_update(ClusterManager.delete_job, f"task-{i}")) + await asyncio.gather(*deletion_updates) + + await wf.signal(ClusterManager.shutdown_cluster) + print("Cluster shut down") + +async def main(): + client = await Client.connect("localhost:7233") + + async with Worker( + client, + task_queue="tq", + workflows=[ClusterManager], + activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], + ): + wf = await client.start_workflow( + ClusterManager.run, + id="wid2", + task_queue="tq", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + start_signal='start_cluster', + + ) + await do_cluster_lifecycle(wf) + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + asyncio.run(main()) From bca534a61b4f2ef1adc3bd8869b8df37cb9538c9 Mon Sep 17 
00:00:00 2001 From: Drew Hoskins Date: Tue, 18 Jun 2024 18:17:58 -0700 Subject: [PATCH 02/28] Remove resize jobs to reduce code size --- .../atomic_message_handlers.py | 32 ++----------------- 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/update_and_signal_handlers/atomic_message_handlers.py b/update_and_signal_handlers/atomic_message_handlers.py index 6cb78953..1f7fc816 100644 --- a/update_and_signal_handlers/atomic_message_handlers.py +++ b/update_and_signal_handlers/atomic_message_handlers.py @@ -36,7 +36,7 @@ async def find_bad_nodes(nodes: List[int]) -> List[int]: # ClusterManager keeps track of the allocations of a cluster of nodes. # Via signals, the cluster can be started and shutdown. -# Via updates, clients can also assign jobs to nodes, resize jobs, and delete jobs. +# Via updates, clients can also assign jobs to nodes and delete jobs. # These updates must run atomically. @workflow.defn class ClusterManager: @@ -103,34 +103,11 @@ async def _deallocate_nodes_for_job(self, nodes_to_free: List[int], task_name: s for node in nodes_to_free: self.nodes[node] = None - @workflow.update - async def resize_job(self, task_name: str, new_size: int) -> List[int]: - await workflow.wait_condition(lambda: self.cluster_started) - assert not self.cluster_shutdown - await self.nodes_lock.acquire() - try: - allocated_nodes = [k for k, v in self.nodes.items() if v == task_name] - delta = new_size - len(allocated_nodes) - if delta == 0: - return allocated_nodes - elif delta > 0: - unassigned_nodes = [k for k, v in self.nodes.items() if v is None] - if len(unassigned_nodes) < delta: - raise ValueError(f"Cannot allocate {delta} nodes; have only {len(unassigned_nodes)} available") - nodes_to_assign = unassigned_nodes[:delta] - await self._allocate_nodes_to_job(nodes_to_assign, task_name) - return allocated_nodes + nodes_to_assign - else: - nodes_to_deallocate = allocated_nodes[delta:] - await self._deallocate_nodes_for_job(nodes_to_deallocate, task_name) - return list(filter(lambda x: x not in nodes_to_deallocate, allocated_nodes)) - finally: - self.nodes_lock.release() - async def perform_health_checks(self): await self.nodes_lock.acquire() try: assigned_nodes = [k for k, v in self.nodes.items() if v is not None] + # This await would be dangerous without nodes_lock because it yields control and allows interleaving. bad_nodes = await workflow.execute_activity(find_bad_nodes, assigned_nodes, start_to_close_timeout=timedelta(seconds=10)) for node in bad_nodes: self.nodes[node] = "BAD!" 
@@ -158,11 +135,6 @@ async def do_cluster_lifecycle(wf: WorkflowHandle): allocation_updates.append(wf.execute_update(ClusterManager.allocate_n_nodes_to_job, args=[f"task-{i}", 2])) await asyncio.gather(*allocation_updates) - resize_updates = [] - for i in range(6): - resize_updates.append(wf.execute_update(ClusterManager.resize_job, args=[f"task-{i}", 4])) - await asyncio.gather(*resize_updates) - deletion_updates = [] for i in range(6): deletion_updates.append(wf.execute_update(ClusterManager.delete_job, f"task-{i}")) From 8b0a6eda0a37810f985eb0aff3c546601085bad8 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Tue, 18 Jun 2024 18:18:15 -0700 Subject: [PATCH 03/28] Misc polish --- .../atomic_message_handlers.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/update_and_signal_handlers/atomic_message_handlers.py b/update_and_signal_handlers/atomic_message_handlers.py index 1f7fc816..dc4761f5 100644 --- a/update_and_signal_handlers/atomic_message_handlers.py +++ b/update_and_signal_handlers/atomic_message_handlers.py @@ -2,15 +2,16 @@ from datetime import timedelta import logging from typing import Dict, List, Optional +import uuid from temporalio import activity, common, workflow from temporalio.client import Client, WorkflowHandle from temporalio.worker import Worker # This samples shows off the key concurrent programming primitives for Workflows, especially -# useful for workflow that receive signals and updates. +# useful for workflows that handle signals and updates. -# - Making signal and update handlers only operate when the workflow is within a certain state +# - Makes signal and update handlers only operate when the workflow is within a certain state # (here between cluster_started and cluster_shutdown) using workflow.wait_condition. # - Signal and update handlers can block and their actions can be interleaved with one another and with the main workflow. # Here, we use a lock to protect shared state from interleaved access. 
@@ -124,6 +125,8 @@ async def run(self): await workflow.wait_condition(lambda: self.cluster_shutdown, timeout=timedelta(seconds=1)) except asyncio.TimeoutError: pass + if self.cluster_shutdown: + break await self.perform_health_checks() # Now we can start allocating jobs to nodes @@ -152,15 +155,18 @@ async def main(): workflows=[ClusterManager], activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], ): - wf = await client.start_workflow( + cluster_manager_handle = await client.start_workflow( ClusterManager.run, - id="wid2", + id=f"ClusterManager-{uuid.uuid4()}", task_queue="tq", id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, start_signal='start_cluster', ) - await do_cluster_lifecycle(wf) + await do_cluster_lifecycle(cluster_manager_handle) + await cluster_manager_handle.result() + + if __name__ == "__main__": logging.basicConfig(level=logging.INFO) From fb7b32fd9aab9b067283cb1c2b53fe3ca28c3445 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Tue, 18 Jun 2024 18:39:04 -0700 Subject: [PATCH 04/28] Add test --- .../atomic_message_handlers_test.py | 26 +++++++++++++++++++ .../atomic_message_handlers.py | 10 +++++-- 2 files changed, 34 insertions(+), 2 deletions(-) create mode 100644 tests/update_and_signal_handlers/atomic_message_handlers_test.py diff --git a/tests/update_and_signal_handlers/atomic_message_handlers_test.py b/tests/update_and_signal_handlers/atomic_message_handlers_test.py new file mode 100644 index 00000000..c1c14858 --- /dev/null +++ b/tests/update_and_signal_handlers/atomic_message_handlers_test.py @@ -0,0 +1,26 @@ +import uuid + +from temporalio import workflow, common +from temporalio.client import Client +from temporalio.worker import Worker +from update_and_signal_handlers.atomic_message_handlers import ClusterManager, allocate_nodes_to_job, deallocate_nodes_for_job, do_cluster_lifecycle, find_bad_nodes + + +async def test_atomic_message_handlers(client: Client): + async with Worker( + client, + task_queue="tq", + workflows=[ClusterManager], + activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], + ): + cluster_manager_handle = await client.start_workflow( + ClusterManager.run, + id=f"ClusterManager-{uuid.uuid4()}", + task_queue="tq", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + start_signal='start_cluster', + + ) + await do_cluster_lifecycle(cluster_manager_handle) + max_assigned_nodes = await cluster_manager_handle.result() + assert max_assigned_nodes == 12 diff --git a/update_and_signal_handlers/atomic_message_handlers.py b/update_and_signal_handlers/atomic_message_handlers.py index dc4761f5..4ec43d72 100644 --- a/update_and_signal_handlers/atomic_message_handlers.py +++ b/update_and_signal_handlers/atomic_message_handlers.py @@ -46,6 +46,7 @@ def __init__(self) -> None: self.cluster_shutdown = False # Protects workflow state from interleaved access self.nodes_lock = asyncio.Lock() + self.max_assigned_nodes = 0 @workflow.signal async def start_cluster(self): @@ -72,6 +73,9 @@ async def allocate_n_nodes_to_job(self, task_name: str, num_nodes: int, ) -> Lis assigned_nodes = unassigned_nodes[:num_nodes] # This await would be dangerous without nodes_lock because it yields control and allows interleaving. 
await self._allocate_nodes_to_job(assigned_nodes, task_name) + self.max_assigned_nodes = max( + self.max_assigned_nodes, + len([k for k, v in self.nodes.items() if v is not None])) return assigned_nodes finally: self.nodes_lock.release() @@ -112,6 +116,7 @@ async def perform_health_checks(self): bad_nodes = await workflow.execute_activity(find_bad_nodes, assigned_nodes, start_to_close_timeout=timedelta(seconds=10)) for node in bad_nodes: self.nodes[node] = "BAD!" + self.num_assigned_nodes = len(assigned_nodes) finally: self.nodes_lock.release() @@ -131,6 +136,7 @@ async def run(self): # Now we can start allocating jobs to nodes await workflow.wait_condition(lambda: self.cluster_shutdown) + return self.max_assigned_nodes async def do_cluster_lifecycle(wf: WorkflowHandle): allocation_updates = [] @@ -144,7 +150,6 @@ async def do_cluster_lifecycle(wf: WorkflowHandle): await asyncio.gather(*deletion_updates) await wf.signal(ClusterManager.shutdown_cluster) - print("Cluster shut down") async def main(): client = await Client.connect("localhost:7233") @@ -164,7 +169,8 @@ async def main(): ) await do_cluster_lifecycle(cluster_manager_handle) - await cluster_manager_handle.result() + max_assigned_nodes = await cluster_manager_handle.result() + print(f"Cluster shut down successfully. It peaked at {max_assigned_nodes} assigned nodes.") From 42d1f12c7a76672df30a99101a3da68d70145f42 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Wed, 19 Jun 2024 08:30:05 -0700 Subject: [PATCH 05/28] Format code --- .../atomic_message_handlers_test.py | 14 ++- .../atomic_message_handlers.py | 85 +++++++++++++------ 2 files changed, 68 insertions(+), 31 deletions(-) diff --git a/tests/update_and_signal_handlers/atomic_message_handlers_test.py b/tests/update_and_signal_handlers/atomic_message_handlers_test.py index c1c14858..f694a4d3 100644 --- a/tests/update_and_signal_handlers/atomic_message_handlers_test.py +++ b/tests/update_and_signal_handlers/atomic_message_handlers_test.py @@ -1,9 +1,16 @@ import uuid -from temporalio import workflow, common +from temporalio import common, workflow from temporalio.client import Client from temporalio.worker import Worker -from update_and_signal_handlers.atomic_message_handlers import ClusterManager, allocate_nodes_to_job, deallocate_nodes_for_job, do_cluster_lifecycle, find_bad_nodes + +from update_and_signal_handlers.atomic_message_handlers import ( + ClusterManager, + allocate_nodes_to_job, + deallocate_nodes_for_job, + do_cluster_lifecycle, + find_bad_nodes, +) async def test_atomic_message_handlers(client: Client): @@ -18,8 +25,7 @@ async def test_atomic_message_handlers(client: Client): id=f"ClusterManager-{uuid.uuid4()}", task_queue="tq", id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, - start_signal='start_cluster', - + start_signal="start_cluster", ) await do_cluster_lifecycle(cluster_manager_handle) max_assigned_nodes = await cluster_manager_handle.result() diff --git a/update_and_signal_handlers/atomic_message_handlers.py b/update_and_signal_handlers/atomic_message_handlers.py index 4ec43d72..8d14c254 100644 --- a/update_and_signal_handlers/atomic_message_handlers.py +++ b/update_and_signal_handlers/atomic_message_handlers.py @@ -1,17 +1,17 @@ import asyncio -from datetime import timedelta import logging -from typing import Dict, List, Optional import uuid +from datetime import timedelta +from typing import Dict, List, Optional from temporalio import activity, common, workflow from temporalio.client import Client, WorkflowHandle from 
temporalio.worker import Worker -# This samples shows off the key concurrent programming primitives for Workflows, especially +# This samples shows off the key concurrent programming primitives for Workflows, especially # useful for workflows that handle signals and updates. -# - Makes signal and update handlers only operate when the workflow is within a certain state +# - Makes signal and update handlers only operate when the workflow is within a certain state # (here between cluster_started and cluster_shutdown) using workflow.wait_condition. # - Signal and update handlers can block and their actions can be interleaved with one another and with the main workflow. # Here, we use a lock to protect shared state from interleaved access. @@ -22,11 +22,13 @@ async def allocate_nodes_to_job(nodes: List[int], task_name: str) -> List[int]: print(f"Assigning nodes {nodes} to job {task_name}") await asyncio.sleep(0.1) + @activity.defn async def deallocate_nodes_for_job(nodes: List[int], task_name: str) -> List[int]: print(f"Deallocating nodes {nodes} from job {task_name}") await asyncio.sleep(0.1) + @activity.defn async def find_bad_nodes(nodes: List[int]) -> List[int]: await asyncio.sleep(0.1) @@ -35,7 +37,8 @@ async def find_bad_nodes(nodes: List[int]) -> List[int]: print(f"Found bad nodes: {bad_nodes}") return bad_nodes -# ClusterManager keeps track of the allocations of a cluster of nodes. + +# ClusterManager keeps track of the allocations of a cluster of nodes. # Via signals, the cluster can be started and shutdown. # Via updates, clients can also assign jobs to nodes and delete jobs. # These updates must run atomically. @@ -51,7 +54,7 @@ def __init__(self) -> None: @workflow.signal async def start_cluster(self): self.cluster_started = True - self.nodes : Dict[Optional[str]] = dict([(k, None) for k in range(25)]) + self.nodes: Dict[Optional[str]] = dict([(k, None) for k in range(25)]) workflow.logger.info("Cluster started") @workflow.signal @@ -61,7 +64,11 @@ async def shutdown_cluster(self): workflow.logger.info("Cluster shut down") @workflow.update - async def allocate_n_nodes_to_job(self, task_name: str, num_nodes: int, ) -> List[int]: + async def allocate_n_nodes_to_job( + self, + task_name: str, + num_nodes: int, + ) -> List[int]: await workflow.wait_condition(lambda: self.cluster_started) assert not self.cluster_shutdown @@ -69,21 +76,27 @@ async def allocate_n_nodes_to_job(self, task_name: str, num_nodes: int, ) -> Lis try: unassigned_nodes = [k for k, v in self.nodes.items() if v is None] if len(unassigned_nodes) < num_nodes: - raise ValueError(f"Cannot allocate {num_nodes} nodes; have only {len(unassigned_nodes)} available") + raise ValueError( + f"Cannot allocate {num_nodes} nodes; have only {len(unassigned_nodes)} available" + ) assigned_nodes = unassigned_nodes[:num_nodes] # This await would be dangerous without nodes_lock because it yields control and allows interleaving. 
await self._allocate_nodes_to_job(assigned_nodes, task_name) self.max_assigned_nodes = max( - self.max_assigned_nodes, - len([k for k, v in self.nodes.items() if v is not None])) + self.max_assigned_nodes, + len([k for k, v in self.nodes.items() if v is not None]), + ) return assigned_nodes finally: self.nodes_lock.release() - - async def _allocate_nodes_to_job(self, assigned_nodes: List[int], task_name: str) -> List[int]: + async def _allocate_nodes_to_job( + self, assigned_nodes: List[int], task_name: str + ) -> List[int]: await workflow.execute_activity( - allocate_nodes_to_job, args=[assigned_nodes, task_name], start_to_close_timeout=timedelta(seconds=10) + allocate_nodes_to_job, + args=[assigned_nodes, task_name], + start_to_close_timeout=timedelta(seconds=10), ) for node in assigned_nodes: self.nodes[node] = task_name @@ -96,14 +109,18 @@ async def delete_job(self, task_name: str) -> str: try: nodes_to_free = [k for k, v in self.nodes.items() if v == task_name] # This await would be dangerous without nodes_lock because it yields control and allows interleaving. - await self._deallocate_nodes_for_job(nodes_to_free, task_name) + await self._deallocate_nodes_for_job(nodes_to_free, task_name) return "Done" finally: self.nodes_lock.release() - async def _deallocate_nodes_for_job(self, nodes_to_free: List[int], task_name: str) -> List[int]: + async def _deallocate_nodes_for_job( + self, nodes_to_free: List[int], task_name: str + ) -> List[int]: await workflow.execute_activity( - deallocate_nodes_for_job, args=[nodes_to_free, task_name], start_to_close_timeout=timedelta(seconds=10) + deallocate_nodes_for_job, + args=[nodes_to_free, task_name], + start_to_close_timeout=timedelta(seconds=10), ) for node in nodes_to_free: self.nodes[node] = None @@ -113,7 +130,11 @@ async def perform_health_checks(self): try: assigned_nodes = [k for k, v in self.nodes.items() if v is not None] # This await would be dangerous without nodes_lock because it yields control and allows interleaving. - bad_nodes = await workflow.execute_activity(find_bad_nodes, assigned_nodes, start_to_close_timeout=timedelta(seconds=10)) + bad_nodes = await workflow.execute_activity( + find_bad_nodes, + assigned_nodes, + start_to_close_timeout=timedelta(seconds=10), + ) for node in bad_nodes: self.nodes[node] = "BAD!" 
self.num_assigned_nodes = len(assigned_nodes) @@ -127,7 +148,9 @@ async def run(self): # Perform health checks at intervals while True: try: - await workflow.wait_condition(lambda: self.cluster_shutdown, timeout=timedelta(seconds=1)) + await workflow.wait_condition( + lambda: self.cluster_shutdown, timeout=timedelta(seconds=1) + ) except asyncio.TimeoutError: pass if self.cluster_shutdown: @@ -138,19 +161,27 @@ async def run(self): await workflow.wait_condition(lambda: self.cluster_shutdown) return self.max_assigned_nodes + async def do_cluster_lifecycle(wf: WorkflowHandle): allocation_updates = [] for i in range(6): - allocation_updates.append(wf.execute_update(ClusterManager.allocate_n_nodes_to_job, args=[f"task-{i}", 2])) - await asyncio.gather(*allocation_updates) - + allocation_updates.append( + wf.execute_update( + ClusterManager.allocate_n_nodes_to_job, args=[f"task-{i}", 2] + ) + ) + await asyncio.gather(*allocation_updates) + deletion_updates = [] for i in range(6): - deletion_updates.append(wf.execute_update(ClusterManager.delete_job, f"task-{i}")) + deletion_updates.append( + wf.execute_update(ClusterManager.delete_job, f"task-{i}") + ) await asyncio.gather(*deletion_updates) - + await wf.signal(ClusterManager.shutdown_cluster) + async def main(): client = await Client.connect("localhost:7233") @@ -165,14 +196,14 @@ async def main(): id=f"ClusterManager-{uuid.uuid4()}", task_queue="tq", id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, - start_signal='start_cluster', - + start_signal="start_cluster", ) await do_cluster_lifecycle(cluster_manager_handle) max_assigned_nodes = await cluster_manager_handle.result() - print(f"Cluster shut down successfully. It peaked at {max_assigned_nodes} assigned nodes.") + print( + f"Cluster shut down successfully. It peaked at {max_assigned_nodes} assigned nodes." 
+ ) - if __name__ == "__main__": logging.basicConfig(level=logging.INFO) From c96f06dc985f196a66ba53edf705fb9c292a9542 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Thu, 20 Jun 2024 08:19:32 -0700 Subject: [PATCH 06/28] Continue as new --- .../atomic_message_handlers_test.py | 8 +- .../atomic_message_handlers.py | 131 ++++++++++++------ 2 files changed, 95 insertions(+), 44 deletions(-) diff --git a/tests/update_and_signal_handlers/atomic_message_handlers_test.py b/tests/update_and_signal_handlers/atomic_message_handlers_test.py index f694a4d3..078a9666 100644 --- a/tests/update_and_signal_handlers/atomic_message_handlers_test.py +++ b/tests/update_and_signal_handlers/atomic_message_handlers_test.py @@ -26,7 +26,9 @@ async def test_atomic_message_handlers(client: Client): task_queue="tq", id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, start_signal="start_cluster", + args=[None, None] ) - await do_cluster_lifecycle(cluster_manager_handle) - max_assigned_nodes = await cluster_manager_handle.result() - assert max_assigned_nodes == 12 + await do_cluster_lifecycle(cluster_manager_handle, delay=1) + result = await cluster_manager_handle.result() + assert result.max_assigned_nodes == 12 + assert result.num_assigned_nodes == 0 diff --git a/update_and_signal_handlers/atomic_message_handlers.py b/update_and_signal_handlers/atomic_message_handlers.py index 8d14c254..c6936807 100644 --- a/update_and_signal_handlers/atomic_message_handlers.py +++ b/update_and_signal_handlers/atomic_message_handlers.py @@ -1,4 +1,5 @@ import asyncio +from dataclasses import dataclass import logging import uuid from datetime import timedelta @@ -14,29 +15,47 @@ # - Makes signal and update handlers only operate when the workflow is within a certain state # (here between cluster_started and cluster_shutdown) using workflow.wait_condition. # - Signal and update handlers can block and their actions can be interleaved with one another and with the main workflow. +# They should also complete before the workflow run completes. # Here, we use a lock to protect shared state from interleaved access. # - Running start_workflow with an initializer signal that you want to run before anything else. +# - An Entity workflow that periodically "continues as new". 
It must do this to prevent its history from growing too large, # @activity.defn -async def allocate_nodes_to_job(nodes: List[int], task_name: str) -> List[int]: +async def allocate_nodes_to_job(nodes: List[str], task_name: str) -> List[str]: print(f"Assigning nodes {nodes} to job {task_name}") await asyncio.sleep(0.1) @activity.defn -async def deallocate_nodes_for_job(nodes: List[int], task_name: str) -> List[int]: +async def deallocate_nodes_for_job(nodes: List[str], task_name: str) -> List[str]: print(f"Deallocating nodes {nodes} from job {task_name}") await asyncio.sleep(0.1) @activity.defn -async def find_bad_nodes(nodes: List[int]) -> List[int]: +async def find_bad_nodes(nodes: List[str]) -> List[str]: await asyncio.sleep(0.1) - bad_nodes = [n for n in nodes if n % 5 == 0] + bad_nodes = [n for n in nodes if int(n) % 5 == 0] if bad_nodes: print(f"Found bad nodes: {bad_nodes}") + else: + print("No new bad nodes found.") return bad_nodes +# In workflows that continue-as-new, it's convenient to store all your state in one serializable structure +# to make it easier to pass between runs +@dataclass(kw_only=True) +class ClusterManagerState: + cluster_started: bool = False + cluster_shutdown: bool = False + nodes: Optional[Dict[str, Optional[str]]] = None + max_assigned_nodes: int = 0 + num_assigned_nodes: int = 0 + +@dataclass +class ClusterManagerResult: + max_assigned_nodes: int + num_assigned_nodes: int # ClusterManager keeps track of the allocations of a cluster of nodes. # Via signals, the cluster can be started and shutdown. @@ -45,22 +64,20 @@ async def find_bad_nodes(nodes: List[int]) -> List[int]: @workflow.defn class ClusterManager: def __init__(self) -> None: - self.cluster_started = False - self.cluster_shutdown = False + self.state = ClusterManagerState() # Protects workflow state from interleaved access self.nodes_lock = asyncio.Lock() - self.max_assigned_nodes = 0 @workflow.signal async def start_cluster(self): - self.cluster_started = True - self.nodes: Dict[Optional[str]] = dict([(k, None) for k in range(25)]) + self.state.cluster_started = True + self.state.nodes = dict([(str(k), None) for k in range(25)]) workflow.logger.info("Cluster started") @workflow.signal async def shutdown_cluster(self): - await workflow.wait_condition(lambda: self.cluster_started) - self.cluster_shutdown = True + await workflow.wait_condition(lambda: self.state.cluster_started) + self.state.cluster_shutdown = True workflow.logger.info("Cluster shut down") @workflow.update @@ -68,13 +85,13 @@ async def allocate_n_nodes_to_job( self, task_name: str, num_nodes: int, - ) -> List[int]: - await workflow.wait_condition(lambda: self.cluster_started) - assert not self.cluster_shutdown + ) -> List[str]: + await workflow.wait_condition(lambda: self.state.cluster_started) + assert not self.state.cluster_shutdown await self.nodes_lock.acquire() try: - unassigned_nodes = [k for k, v in self.nodes.items() if v is None] + unassigned_nodes = [k for k, v in self.state.nodes.items() if v is None] if len(unassigned_nodes) < num_nodes: raise ValueError( f"Cannot allocate {num_nodes} nodes; have only {len(unassigned_nodes)} available" @@ -82,32 +99,32 @@ async def allocate_n_nodes_to_job( assigned_nodes = unassigned_nodes[:num_nodes] # This await would be dangerous without nodes_lock because it yields control and allows interleaving. 
await self._allocate_nodes_to_job(assigned_nodes, task_name) - self.max_assigned_nodes = max( - self.max_assigned_nodes, - len([k for k, v in self.nodes.items() if v is not None]), + self.state.max_assigned_nodes = max( + self.state.max_assigned_nodes, + len([k for k, v in self.state.nodes.items() if v is not None]), ) return assigned_nodes finally: self.nodes_lock.release() async def _allocate_nodes_to_job( - self, assigned_nodes: List[int], task_name: str - ) -> List[int]: + self, assigned_nodes: List[str], task_name: str + ) -> List[str]: await workflow.execute_activity( allocate_nodes_to_job, args=[assigned_nodes, task_name], start_to_close_timeout=timedelta(seconds=10), ) for node in assigned_nodes: - self.nodes[node] = task_name + self.state.nodes[node] = task_name @workflow.update async def delete_job(self, task_name: str) -> str: - await workflow.wait_condition(lambda: self.cluster_started) - assert not self.cluster_shutdown + await workflow.wait_condition(lambda: self.state.cluster_started) + assert not self.state.cluster_shutdown await self.nodes_lock.acquire() try: - nodes_to_free = [k for k, v in self.nodes.items() if v == task_name] + nodes_to_free = [k for k, v in self.state.nodes.items() if v == task_name] # This await would be dangerous without nodes_lock because it yields control and allows interleaving. await self._deallocate_nodes_for_job(nodes_to_free, task_name) return "Done" @@ -115,20 +132,20 @@ async def delete_job(self, task_name: str) -> str: self.nodes_lock.release() async def _deallocate_nodes_for_job( - self, nodes_to_free: List[int], task_name: str - ) -> List[int]: + self, nodes_to_free: List[str], task_name: str + ) -> List[str]: await workflow.execute_activity( deallocate_nodes_for_job, args=[nodes_to_free, task_name], start_to_close_timeout=timedelta(seconds=10), ) for node in nodes_to_free: - self.nodes[node] = None + self.state.nodes[node] = None async def perform_health_checks(self): await self.nodes_lock.acquire() try: - assigned_nodes = [k for k, v in self.nodes.items() if v is not None] + assigned_nodes = [k for k, v in self.state.nodes.items() if v is not None and v != "BAD!"] # This await would be dangerous without nodes_lock because it yields control and allows interleaving. bad_nodes = await workflow.execute_activity( find_bad_nodes, @@ -136,33 +153,60 @@ async def perform_health_checks(self): start_to_close_timeout=timedelta(seconds=10), ) for node in bad_nodes: - self.nodes[node] = "BAD!" - self.num_assigned_nodes = len(assigned_nodes) + self.state.nodes[node] = "BAD!" + self.state.num_assigned_nodes = len(assigned_nodes) finally: self.nodes_lock.release() + # The cluster manager is a long-running "entity" workflow so we need to periodically checkpoint its state and + # continue-as-new. + def init_from_previous_run(self, state: Optional[ClusterManagerState], max_history_length: Optional[int]): + if state: + self.state = state + self.max_history_length = max_history_length + + def should_continue_as_new(self): + # We don't want to continue-as-new if we're in the middle of an update + if self.nodes_lock.locked(): + return False + if workflow.info().is_continue_as_new_suggested(): + return True + # This is just for ease-of-testing. In production, we trust temporal to tell us when to continue as new. + if self.max_history_length and workflow.info().get_current_history_length() > self.max_history_length: + return True + return False + + # max_history_size - to more conveniently test continue-as-new, not to be used in production. 
@workflow.run - async def run(self): - await workflow.wait_condition(lambda: self.cluster_started) + async def run( + self, + state: Optional[ClusterManagerState], + max_history_length: Optional[int]) -> ClusterManagerResult: + self.init_from_previous_run(state, max_history_length) + await workflow.wait_condition(lambda: self.state.cluster_started) - # Perform health checks at intervals + # Perform health checks at intervals. (Waking up so frequently is a bad idea in practice because it will rapidly + # increase your workflow history size.) while True: try: await workflow.wait_condition( - lambda: self.cluster_shutdown, timeout=timedelta(seconds=1) + lambda: self.state.cluster_shutdown or self.should_continue_as_new(), timeout=timedelta(seconds=1) ) except asyncio.TimeoutError: pass - if self.cluster_shutdown: + if self.state.cluster_shutdown: break await self.perform_health_checks() + if self.should_continue_as_new(): + workflow.logger.info("Continuing as new") + await workflow.continue_as_new(args=[self.state, self.max_history_length]) # Now we can start allocating jobs to nodes - await workflow.wait_condition(lambda: self.cluster_shutdown) - return self.max_assigned_nodes + await workflow.wait_condition(lambda: self.state.cluster_shutdown) + return ClusterManagerResult(self.state.max_assigned_nodes, self.state.num_assigned_nodes) -async def do_cluster_lifecycle(wf: WorkflowHandle): +async def do_cluster_lifecycle(wf: WorkflowHandle, delay: Optional[int] = None): allocation_updates = [] for i in range(6): allocation_updates.append( @@ -172,6 +216,9 @@ async def do_cluster_lifecycle(wf: WorkflowHandle): ) await asyncio.gather(*allocation_updates) + if delay: + await asyncio.sleep(delay) + deletion_updates = [] for i in range(6): deletion_updates.append( @@ -193,15 +240,17 @@ async def main(): ): cluster_manager_handle = await client.start_workflow( ClusterManager.run, + args=[None, 150], # max_history_length to conveniently test continue-as-new id=f"ClusterManager-{uuid.uuid4()}", task_queue="tq", id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, start_signal="start_cluster", ) - await do_cluster_lifecycle(cluster_manager_handle) - max_assigned_nodes = await cluster_manager_handle.result() + await do_cluster_lifecycle(cluster_manager_handle, delay=1) + result = await cluster_manager_handle.result() print( - f"Cluster shut down successfully. It peaked at {max_assigned_nodes} assigned nodes." + f"Cluster shut down successfully. It peaked at {result.max_assigned_nodes} assigned nodes ."\ + f" It had {result.num_assigned_nodes} nodes assigned at the end." 
) From 6944099d56536ffad5c4cf862a7f8804ec967333 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Thu, 20 Jun 2024 08:19:55 -0700 Subject: [PATCH 07/28] Formatting --- .../atomic_message_handlers_test.py | 2 +- .../atomic_message_handlers.py | 41 +++++++++++++------ 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/tests/update_and_signal_handlers/atomic_message_handlers_test.py b/tests/update_and_signal_handlers/atomic_message_handlers_test.py index 078a9666..2284e70c 100644 --- a/tests/update_and_signal_handlers/atomic_message_handlers_test.py +++ b/tests/update_and_signal_handlers/atomic_message_handlers_test.py @@ -26,7 +26,7 @@ async def test_atomic_message_handlers(client: Client): task_queue="tq", id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, start_signal="start_cluster", - args=[None, None] + args=[None, None], ) await do_cluster_lifecycle(cluster_manager_handle, delay=1) result = await cluster_manager_handle.result() diff --git a/update_and_signal_handlers/atomic_message_handlers.py b/update_and_signal_handlers/atomic_message_handlers.py index c6936807..3ae5c964 100644 --- a/update_and_signal_handlers/atomic_message_handlers.py +++ b/update_and_signal_handlers/atomic_message_handlers.py @@ -1,7 +1,7 @@ import asyncio -from dataclasses import dataclass import logging import uuid +from dataclasses import dataclass from datetime import timedelta from typing import Dict, List, Optional @@ -42,6 +42,7 @@ async def find_bad_nodes(nodes: List[str]) -> List[str]: print("No new bad nodes found.") return bad_nodes + # In workflows that continue-as-new, it's convenient to store all your state in one serializable structure # to make it easier to pass between runs @dataclass(kw_only=True) @@ -52,11 +53,13 @@ class ClusterManagerState: max_assigned_nodes: int = 0 num_assigned_nodes: int = 0 + @dataclass class ClusterManagerResult: max_assigned_nodes: int num_assigned_nodes: int + # ClusterManager keeps track of the allocations of a cluster of nodes. # Via signals, the cluster can be started and shutdown. # Via updates, clients can also assign jobs to nodes and delete jobs. @@ -145,7 +148,9 @@ async def _deallocate_nodes_for_job( async def perform_health_checks(self): await self.nodes_lock.acquire() try: - assigned_nodes = [k for k, v in self.state.nodes.items() if v is not None and v != "BAD!"] + assigned_nodes = [ + k for k, v in self.state.nodes.items() if v is not None and v != "BAD!" + ] # This await would be dangerous without nodes_lock because it yields control and allows interleaving. bad_nodes = await workflow.execute_activity( find_bad_nodes, @@ -158,9 +163,11 @@ async def perform_health_checks(self): finally: self.nodes_lock.release() - # The cluster manager is a long-running "entity" workflow so we need to periodically checkpoint its state and + # The cluster manager is a long-running "entity" workflow so we need to periodically checkpoint its state and # continue-as-new. - def init_from_previous_run(self, state: Optional[ClusterManagerState], max_history_length: Optional[int]): + def init_from_previous_run( + self, state: Optional[ClusterManagerState], max_history_length: Optional[int] + ): if state: self.state = state self.max_history_length = max_history_length @@ -172,16 +179,18 @@ def should_continue_as_new(self): if workflow.info().is_continue_as_new_suggested(): return True # This is just for ease-of-testing. In production, we trust temporal to tell us when to continue as new. 
- if self.max_history_length and workflow.info().get_current_history_length() > self.max_history_length: + if ( + self.max_history_length + and workflow.info().get_current_history_length() > self.max_history_length + ): return True return False # max_history_size - to more conveniently test continue-as-new, not to be used in production. @workflow.run async def run( - self, - state: Optional[ClusterManagerState], - max_history_length: Optional[int]) -> ClusterManagerResult: + self, state: Optional[ClusterManagerState], max_history_length: Optional[int] + ) -> ClusterManagerResult: self.init_from_previous_run(state, max_history_length) await workflow.wait_condition(lambda: self.state.cluster_started) @@ -190,7 +199,9 @@ async def run( while True: try: await workflow.wait_condition( - lambda: self.state.cluster_shutdown or self.should_continue_as_new(), timeout=timedelta(seconds=1) + lambda: self.state.cluster_shutdown + or self.should_continue_as_new(), + timeout=timedelta(seconds=1), ) except asyncio.TimeoutError: pass @@ -199,11 +210,15 @@ async def run( await self.perform_health_checks() if self.should_continue_as_new(): workflow.logger.info("Continuing as new") - await workflow.continue_as_new(args=[self.state, self.max_history_length]) + await workflow.continue_as_new( + args=[self.state, self.max_history_length] + ) # Now we can start allocating jobs to nodes await workflow.wait_condition(lambda: self.state.cluster_shutdown) - return ClusterManagerResult(self.state.max_assigned_nodes, self.state.num_assigned_nodes) + return ClusterManagerResult( + self.state.max_assigned_nodes, self.state.num_assigned_nodes + ) async def do_cluster_lifecycle(wf: WorkflowHandle, delay: Optional[int] = None): @@ -240,7 +255,7 @@ async def main(): ): cluster_manager_handle = await client.start_workflow( ClusterManager.run, - args=[None, 150], # max_history_length to conveniently test continue-as-new + args=[None, 150], # max_history_length to conveniently test continue-as-new id=f"ClusterManager-{uuid.uuid4()}", task_queue="tq", id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, @@ -249,7 +264,7 @@ async def main(): await do_cluster_lifecycle(cluster_manager_handle, delay=1) result = await cluster_manager_handle.result() print( - f"Cluster shut down successfully. It peaked at {result.max_assigned_nodes} assigned nodes ."\ + f"Cluster shut down successfully. It peaked at {result.max_assigned_nodes} assigned nodes ." f" It had {result.num_assigned_nodes} nodes assigned at the end." 
) From ec1fb89714fe9adebb8b69660e1437e03b0a17d9 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Sat, 22 Jun 2024 09:00:38 -0700 Subject: [PATCH 08/28] Feedback, readme, restructure files and directories --- .../atomic_message_handlers_test.py | 20 ++-- .../atomic_message_handlers/README.md | 20 ++++ .../atomic_message_handlers/activities.py | 25 ++++ .../atomic_message_handlers/starter.py | 54 +++++++++ .../atomic_message_handlers/worker.py | 33 ++++++ .../atomic_message_handlers/workflow.py | 107 +----------------- 6 files changed, 148 insertions(+), 111 deletions(-) rename tests/{update_and_signal_handlers => updates_and_signals}/atomic_message_handlers_test.py (60%) create mode 100644 updates_and_signals/atomic_message_handlers/README.md create mode 100644 updates_and_signals/atomic_message_handlers/activities.py create mode 100644 updates_and_signals/atomic_message_handlers/starter.py create mode 100644 updates_and_signals/atomic_message_handlers/worker.py rename update_and_signal_handlers/atomic_message_handlers.py => updates_and_signals/atomic_message_handlers/workflow.py (65%) diff --git a/tests/update_and_signal_handlers/atomic_message_handlers_test.py b/tests/updates_and_signals/atomic_message_handlers_test.py similarity index 60% rename from tests/update_and_signal_handlers/atomic_message_handlers_test.py rename to tests/updates_and_signals/atomic_message_handlers_test.py index 2284e70c..5fc7cbcc 100644 --- a/tests/update_and_signal_handlers/atomic_message_handlers_test.py +++ b/tests/updates_and_signals/atomic_message_handlers_test.py @@ -4,31 +4,31 @@ from temporalio.client import Client from temporalio.worker import Worker -from update_and_signal_handlers.atomic_message_handlers import ( - ClusterManager, +from updates_and_signals.atomic_message_handlers.starter import do_cluster_lifecycle +from updates_and_signals.atomic_message_handlers.workflow import ClusterManagerWorkflow +from updates_and_signals.atomic_message_handlers.activities import ( allocate_nodes_to_job, deallocate_nodes_for_job, - do_cluster_lifecycle, find_bad_nodes, ) async def test_atomic_message_handlers(client: Client): + task_queue = f"tq-{uuid.uuid4()}" async with Worker( client, - task_queue="tq", - workflows=[ClusterManager], + task_queue=task_queue, + workflows=[ClusterManagerWorkflow], activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], ): cluster_manager_handle = await client.start_workflow( - ClusterManager.run, - id=f"ClusterManager-{uuid.uuid4()}", - task_queue="tq", - id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + ClusterManagerWorkflow.run, + id=f"ClusterManagerWorkflow-{uuid.uuid4()}", + task_queue=task_queue, start_signal="start_cluster", args=[None, None], ) - await do_cluster_lifecycle(cluster_manager_handle, delay=1) + await do_cluster_lifecycle(cluster_manager_handle, delay_seconds=1) result = await cluster_manager_handle.result() assert result.max_assigned_nodes == 12 assert result.num_assigned_nodes == 0 diff --git a/updates_and_signals/atomic_message_handlers/README.md b/updates_and_signals/atomic_message_handlers/README.md new file mode 100644 index 00000000..c808d4c9 --- /dev/null +++ b/updates_and_signals/atomic_message_handlers/README.md @@ -0,0 +1,20 @@ +# Atomic message handlers + +This sample shows off important techniques for handling signals and updates, aka messages. In particular, it illustrates how message handlers can interleave and how you can manage that. 
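Condensed to its essence, the pattern looks roughly like the sketch below (the `GatedAllocatorSketch` class and `allocate_two_nodes` update are hypothetical names; the real implementation is `ClusterManagerWorkflow` in `workflow.py`, which the points after the sketch walk through): gate each handler on workflow state with `workflow.wait_condition`, and hold a lock across any `await` so concurrent handlers cannot interleave their reads and writes of shared state.

```python
import asyncio
from datetime import timedelta
from typing import Dict, List, Optional

from temporalio import workflow

from updates_and_signals.atomic_message_handlers.activities import allocate_nodes_to_job


@workflow.defn
class GatedAllocatorSketch:
    def __init__(self) -> None:
        self.cluster_started = False
        self.cluster_shutdown = False
        self.nodes: Dict[str, Optional[str]] = {}
        # One lock guards every handler that awaits while touching self.nodes.
        self.nodes_lock = asyncio.Lock()

    @workflow.signal
    async def start_cluster(self) -> None:
        self.nodes = {str(k): None for k in range(25)}
        self.cluster_started = True

    @workflow.signal
    async def shutdown_cluster(self) -> None:
        self.cluster_shutdown = True

    @workflow.update
    async def allocate_two_nodes(self, task_name: str) -> List[str]:
        # Gate: the update may arrive before the cluster has been started.
        await workflow.wait_condition(lambda: self.cluster_started)
        async with self.nodes_lock:
            free = [k for k, v in self.nodes.items() if v is None]
            chosen = free[:2]
            # This await yields control; the lock stops another handler from
            # reading the same free nodes before we record the assignment.
            await workflow.execute_activity(
                allocate_nodes_to_job,
                args=[chosen, task_name],
                start_to_close_timeout=timedelta(seconds=10),
            )
            for k in chosen:
                self.nodes[k] = task_name
            return chosen

    @workflow.run
    async def run(self) -> List[str]:
        await workflow.wait_condition(lambda: self.cluster_shutdown)
        return [k for k, v in self.nodes.items() if v is not None]
```

The sketch deliberately skips the handler-completion and continue-as-new concerns that the real `ClusterManagerWorkflow` handles; those are covered in the points below.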
+ +* Here, using workflow.wait_condition, signal and update handlers will only operate when the workflow is within a certain state--between cluster_started and cluster_shutdown. +* You can run start_workflow with an initializer signal that you want to run before anything else other than the workflow's constructor. This pattern is known as "signal-with-start." +* Message handlers can block and their actions can be interleaved with one another and with the main workflow. This can easily cause bugs, so we use a lock to protect shared state from interleaved access. +* Message handlers should also finish before the workflow run completes. One option is to use a lock. +* An "Entity" workflow, i.e. a long-lived workflow, periodically "continues as new". It must do this to prevent its history from growing too large, and it passes its state to the next workflow. You can check `workflow.info().is_continue_as_new_suggested()` to see when it's time. Just make sure message handlers have finished before doing so. + +To run, first see [README.md](../../README.md) for prerequisites. + +Then, run the following from this directory to run the sample: + +```bash +poetry run python worker.py +poetry run python starter.py +``` + +This will start a worker to run your workflow and activities, then start a ClusterManagerWorkflow and put it through its paces. diff --git a/updates_and_signals/atomic_message_handlers/activities.py b/updates_and_signals/atomic_message_handlers/activities.py new file mode 100644 index 00000000..a3e43b94 --- /dev/null +++ b/updates_and_signals/atomic_message_handlers/activities.py @@ -0,0 +1,25 @@ +import asyncio +from typing import List +from temporalio import activity + +@activity.defn +async def allocate_nodes_to_job(nodes: List[str], task_name: str) -> List[str]: + print(f"Assigning nodes {nodes} to job {task_name}") + await asyncio.sleep(0.1) + + +@activity.defn +async def deallocate_nodes_for_job(nodes: List[str], task_name: str) -> List[str]: + print(f"Deallocating nodes {nodes} from job {task_name}") + await asyncio.sleep(0.1) + + +@activity.defn +async def find_bad_nodes(nodes: List[str]) -> List[str]: + await asyncio.sleep(0.1) + bad_nodes = [n for n in nodes if int(n) % 5 == 0] + if bad_nodes: + print(f"Found bad nodes: {bad_nodes}") + else: + print("No new bad nodes found.") + return bad_nodes diff --git a/updates_and_signals/atomic_message_handlers/starter.py b/updates_and_signals/atomic_message_handlers/starter.py new file mode 100644 index 00000000..947f7a87 --- /dev/null +++ b/updates_and_signals/atomic_message_handlers/starter.py @@ -0,0 +1,54 @@ +import asyncio +import logging +from typing import Optional +import uuid +from temporalio import client, common +from temporalio.client import Client, WorkflowHandle + +from updates_and_signals.atomic_message_handlers.workflow import ClusterManagerWorkflow + +async def do_cluster_lifecycle(wf: WorkflowHandle, delay_seconds: Optional[int] = None): + allocation_updates = [] + for i in range(6): + allocation_updates.append( + wf.execute_update( + ClusterManagerWorkflow.allocate_n_nodes_to_job, args=[f"task-{i}", 2] + ) + ) + await asyncio.gather(*allocation_updates) + + if delay_seconds: + await asyncio.sleep(delay_seconds) + + deletion_updates = [] + for i in range(6): + deletion_updates.append( + wf.execute_update(ClusterManagerWorkflow.delete_job, f"task-{i}") + ) + await asyncio.gather(*deletion_updates) + + await wf.signal(ClusterManagerWorkflow.shutdown_cluster) + +async def main(): + # Connect to Temporal + client = await 
Client.connect("localhost:7233") + + cluster_manager_handle = await client.start_workflow( + ClusterManagerWorkflow.run, + args=[None, 150], # max_history_length to conveniently test continue-as-new + id=f"ClusterManagerWorkflow-{uuid.uuid4()}", + task_queue="atomic-message-handlers-task-queue", + id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, + start_signal="start_cluster", + ) + await do_cluster_lifecycle(cluster_manager_handle, delay_seconds=1) + result = await cluster_manager_handle.result() + print( + f"Cluster shut down successfully. It peaked at {result.max_assigned_nodes} assigned nodes ." + f" It had {result.num_assigned_nodes} nodes assigned at the end." + ) + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + asyncio.run(main()) diff --git a/updates_and_signals/atomic_message_handlers/worker.py b/updates_and_signals/atomic_message_handlers/worker.py new file mode 100644 index 00000000..56e8a5b0 --- /dev/null +++ b/updates_and_signals/atomic_message_handlers/worker.py @@ -0,0 +1,33 @@ +import asyncio +import logging +from temporalio import activity, common, workflow +from temporalio.client import Client, WorkflowHandle +from temporalio.worker import Worker + +from updates_and_signals.atomic_message_handlers.workflow import ClusterManagerWorkflow, allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes + +interrupt_event = asyncio.Event() + +async def main(): + # Connect client + client = await Client.connect("localhost:7233") + + async with Worker( + client, + task_queue="atomic-message-handlers-task-queue", + workflows=[ClusterManagerWorkflow], + activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], + ): + # Wait until interrupted + logging.info("ClusterManagerWorkflow worker started, ctrl+c to exit") + await interrupt_event.wait() + logging.info("Shutting down") + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + loop = asyncio.new_event_loop() + try: + loop.run_until_complete(main()) + except KeyboardInterrupt: + interrupt_event.set() + loop.run_until_complete(loop.shutdown_asyncgens()) diff --git a/update_and_signal_handlers/atomic_message_handlers.py b/updates_and_signals/atomic_message_handlers/workflow.py similarity index 65% rename from update_and_signal_handlers/atomic_message_handlers.py rename to updates_and_signals/atomic_message_handlers/workflow.py index 3ae5c964..e3d552a0 100644 --- a/update_and_signal_handlers/atomic_message_handlers.py +++ b/updates_and_signals/atomic_message_handlers/workflow.py @@ -9,38 +9,7 @@ from temporalio.client import Client, WorkflowHandle from temporalio.worker import Worker -# This samples shows off the key concurrent programming primitives for Workflows, especially -# useful for workflows that handle signals and updates. - -# - Makes signal and update handlers only operate when the workflow is within a certain state -# (here between cluster_started and cluster_shutdown) using workflow.wait_condition. -# - Signal and update handlers can block and their actions can be interleaved with one another and with the main workflow. -# They should also complete before the workflow run completes. -# Here, we use a lock to protect shared state from interleaved access. -# - Running start_workflow with an initializer signal that you want to run before anything else. -# - An Entity workflow that periodically "continues as new". 
It must do this to prevent its history from growing too large, -# -@activity.defn -async def allocate_nodes_to_job(nodes: List[str], task_name: str) -> List[str]: - print(f"Assigning nodes {nodes} to job {task_name}") - await asyncio.sleep(0.1) - - -@activity.defn -async def deallocate_nodes_for_job(nodes: List[str], task_name: str) -> List[str]: - print(f"Deallocating nodes {nodes} from job {task_name}") - await asyncio.sleep(0.1) - - -@activity.defn -async def find_bad_nodes(nodes: List[str]) -> List[str]: - await asyncio.sleep(0.1) - bad_nodes = [n for n in nodes if int(n) % 5 == 0] - if bad_nodes: - print(f"Found bad nodes: {bad_nodes}") - else: - print("No new bad nodes found.") - return bad_nodes +from updates_and_signals.atomic_message_handlers.activities import allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes # In workflows that continue-as-new, it's convenient to store all your state in one serializable structure @@ -53,19 +22,17 @@ class ClusterManagerState: max_assigned_nodes: int = 0 num_assigned_nodes: int = 0 - @dataclass class ClusterManagerResult: max_assigned_nodes: int num_assigned_nodes: int - -# ClusterManager keeps track of the allocations of a cluster of nodes. +# ClusterManagerWorkflow keeps track of the allocations of a cluster of nodes. # Via signals, the cluster can be started and shutdown. # Via updates, clients can also assign jobs to nodes and delete jobs. # These updates must run atomically. @workflow.defn -class ClusterManager: +class ClusterManagerWorkflow: def __init__(self) -> None: self.state = ClusterManagerState() # Protects workflow state from interleaved access @@ -92,8 +59,7 @@ async def allocate_n_nodes_to_job( await workflow.wait_condition(lambda: self.state.cluster_started) assert not self.state.cluster_shutdown - await self.nodes_lock.acquire() - try: + async with self.nodes_lock: unassigned_nodes = [k for k, v in self.state.nodes.items() if v is None] if len(unassigned_nodes) < num_nodes: raise ValueError( @@ -107,8 +73,6 @@ async def allocate_n_nodes_to_job( len([k for k, v in self.state.nodes.items() if v is not None]), ) return assigned_nodes - finally: - self.nodes_lock.release() async def _allocate_nodes_to_job( self, assigned_nodes: List[str], task_name: str @@ -125,14 +89,11 @@ async def _allocate_nodes_to_job( async def delete_job(self, task_name: str) -> str: await workflow.wait_condition(lambda: self.state.cluster_started) assert not self.state.cluster_shutdown - await self.nodes_lock.acquire() - try: + async with self.nodes_lock: nodes_to_free = [k for k, v in self.state.nodes.items() if v == task_name] # This await would be dangerous without nodes_lock because it yields control and allows interleaving. await self._deallocate_nodes_for_job(nodes_to_free, task_name) return "Done" - finally: - self.nodes_lock.release() async def _deallocate_nodes_for_job( self, nodes_to_free: List[str], task_name: str @@ -146,8 +107,7 @@ async def _deallocate_nodes_for_job( self.state.nodes[node] = None async def perform_health_checks(self): - await self.nodes_lock.acquire() - try: + async with self.nodes_lock: assigned_nodes = [ k for k, v in self.state.nodes.items() if v is not None and v != "BAD!" ] @@ -160,8 +120,6 @@ async def perform_health_checks(self): for node in bad_nodes: self.state.nodes[node] = "BAD!" self.state.num_assigned_nodes = len(assigned_nodes) - finally: - self.nodes_lock.release() # The cluster manager is a long-running "entity" workflow so we need to periodically checkpoint its state and # continue-as-new. 
@@ -219,56 +177,3 @@ async def run( return ClusterManagerResult( self.state.max_assigned_nodes, self.state.num_assigned_nodes ) - - -async def do_cluster_lifecycle(wf: WorkflowHandle, delay: Optional[int] = None): - allocation_updates = [] - for i in range(6): - allocation_updates.append( - wf.execute_update( - ClusterManager.allocate_n_nodes_to_job, args=[f"task-{i}", 2] - ) - ) - await asyncio.gather(*allocation_updates) - - if delay: - await asyncio.sleep(delay) - - deletion_updates = [] - for i in range(6): - deletion_updates.append( - wf.execute_update(ClusterManager.delete_job, f"task-{i}") - ) - await asyncio.gather(*deletion_updates) - - await wf.signal(ClusterManager.shutdown_cluster) - - -async def main(): - client = await Client.connect("localhost:7233") - - async with Worker( - client, - task_queue="tq", - workflows=[ClusterManager], - activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], - ): - cluster_manager_handle = await client.start_workflow( - ClusterManager.run, - args=[None, 150], # max_history_length to conveniently test continue-as-new - id=f"ClusterManager-{uuid.uuid4()}", - task_queue="tq", - id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, - start_signal="start_cluster", - ) - await do_cluster_lifecycle(cluster_manager_handle, delay=1) - result = await cluster_manager_handle.result() - print( - f"Cluster shut down successfully. It peaked at {result.max_assigned_nodes} assigned nodes ." - f" It had {result.num_assigned_nodes} nodes assigned at the end." - ) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - asyncio.run(main()) From dd58c647f4717c3ecc9c7bd42c85920976b3dbc3 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Sat, 22 Jun 2024 09:01:53 -0700 Subject: [PATCH 09/28] Format --- .../atomic_message_handlers_test.py | 6 +++--- .../atomic_message_handlers/activities.py | 2 ++ updates_and_signals/atomic_message_handlers/starter.py | 6 +++++- updates_and_signals/atomic_message_handlers/worker.py | 10 +++++++++- .../atomic_message_handlers/workflow.py | 8 +++++++- 5 files changed, 26 insertions(+), 6 deletions(-) diff --git a/tests/updates_and_signals/atomic_message_handlers_test.py b/tests/updates_and_signals/atomic_message_handlers_test.py index 5fc7cbcc..186b80e7 100644 --- a/tests/updates_and_signals/atomic_message_handlers_test.py +++ b/tests/updates_and_signals/atomic_message_handlers_test.py @@ -4,13 +4,13 @@ from temporalio.client import Client from temporalio.worker import Worker -from updates_and_signals.atomic_message_handlers.starter import do_cluster_lifecycle -from updates_and_signals.atomic_message_handlers.workflow import ClusterManagerWorkflow -from updates_and_signals.atomic_message_handlers.activities import ( +from updates_and_signals.atomic_message_handlers.activities import ( allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes, ) +from updates_and_signals.atomic_message_handlers.starter import do_cluster_lifecycle +from updates_and_signals.atomic_message_handlers.workflow import ClusterManagerWorkflow async def test_atomic_message_handlers(client: Client): diff --git a/updates_and_signals/atomic_message_handlers/activities.py b/updates_and_signals/atomic_message_handlers/activities.py index a3e43b94..78f7f66a 100644 --- a/updates_and_signals/atomic_message_handlers/activities.py +++ b/updates_and_signals/atomic_message_handlers/activities.py @@ -1,7 +1,9 @@ import asyncio from typing import List + from temporalio import activity + @activity.defn async def 
allocate_nodes_to_job(nodes: List[str], task_name: str) -> List[str]: print(f"Assigning nodes {nodes} to job {task_name}") diff --git a/updates_and_signals/atomic_message_handlers/starter.py b/updates_and_signals/atomic_message_handlers/starter.py index 947f7a87..892fbd0b 100644 --- a/updates_and_signals/atomic_message_handlers/starter.py +++ b/updates_and_signals/atomic_message_handlers/starter.py @@ -1,12 +1,14 @@ import asyncio import logging -from typing import Optional import uuid +from typing import Optional + from temporalio import client, common from temporalio.client import Client, WorkflowHandle from updates_and_signals.atomic_message_handlers.workflow import ClusterManagerWorkflow + async def do_cluster_lifecycle(wf: WorkflowHandle, delay_seconds: Optional[int] = None): allocation_updates = [] for i in range(6): @@ -29,6 +31,7 @@ async def do_cluster_lifecycle(wf: WorkflowHandle, delay_seconds: Optional[int] await wf.signal(ClusterManagerWorkflow.shutdown_cluster) + async def main(): # Connect to Temporal client = await Client.connect("localhost:7233") @@ -48,6 +51,7 @@ async def main(): f" It had {result.num_assigned_nodes} nodes assigned at the end." ) + if __name__ == "__main__": logging.basicConfig(level=logging.INFO) diff --git a/updates_and_signals/atomic_message_handlers/worker.py b/updates_and_signals/atomic_message_handlers/worker.py index 56e8a5b0..fbe00850 100644 --- a/updates_and_signals/atomic_message_handlers/worker.py +++ b/updates_and_signals/atomic_message_handlers/worker.py @@ -1,13 +1,20 @@ import asyncio import logging + from temporalio import activity, common, workflow from temporalio.client import Client, WorkflowHandle from temporalio.worker import Worker -from updates_and_signals.atomic_message_handlers.workflow import ClusterManagerWorkflow, allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes +from updates_and_signals.atomic_message_handlers.workflow import ( + ClusterManagerWorkflow, + allocate_nodes_to_job, + deallocate_nodes_for_job, + find_bad_nodes, +) interrupt_event = asyncio.Event() + async def main(): # Connect client client = await Client.connect("localhost:7233") @@ -23,6 +30,7 @@ async def main(): await interrupt_event.wait() logging.info("Shutting down") + if __name__ == "__main__": logging.basicConfig(level=logging.INFO) loop = asyncio.new_event_loop() diff --git a/updates_and_signals/atomic_message_handlers/workflow.py b/updates_and_signals/atomic_message_handlers/workflow.py index e3d552a0..268a6e0d 100644 --- a/updates_and_signals/atomic_message_handlers/workflow.py +++ b/updates_and_signals/atomic_message_handlers/workflow.py @@ -9,7 +9,11 @@ from temporalio.client import Client, WorkflowHandle from temporalio.worker import Worker -from updates_and_signals.atomic_message_handlers.activities import allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes +from updates_and_signals.atomic_message_handlers.activities import ( + allocate_nodes_to_job, + deallocate_nodes_for_job, + find_bad_nodes, +) # In workflows that continue-as-new, it's convenient to store all your state in one serializable structure @@ -22,11 +26,13 @@ class ClusterManagerState: max_assigned_nodes: int = 0 num_assigned_nodes: int = 0 + @dataclass class ClusterManagerResult: max_assigned_nodes: int num_assigned_nodes: int + # ClusterManagerWorkflow keeps track of the allocations of a cluster of nodes. # Via signals, the cluster can be started and shutdown. # Via updates, clients can also assign jobs to nodes and delete jobs. 
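
The restructuring above also switches the handlers from explicit acquire/release to `async with self.nodes_lock:`. The lock matters because awaiting an activity inside a handler yields control to other handlers and to the main workflow method. A small sketch of the pattern, using an assumed, hypothetical activity name:

    import asyncio
    from datetime import timedelta
    from typing import List

    from temporalio import workflow


    @workflow.defn
    class LockedAllocator:
        def __init__(self) -> None:
            self.taken: List[int] = []
            self.lock = asyncio.Lock()

        @workflow.update
        async def reserve(self, n: int) -> List[int]:
            async with self.lock:
                chosen = [len(self.taken) + i for i in range(n)]
                # Awaiting an activity suspends this handler and lets other signal/
                # update handlers (and the main run method) execute. Without the
                # lock, a concurrent reserve() could pick the same slots here.
                await workflow.execute_activity(
                    "provision_slots",  # hypothetical activity, referenced by name
                    chosen,
                    start_to_close_timeout=timedelta(seconds=10),
                )
                self.taken.extend(chosen)
                return chosen
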
From 37e56edfb21115a83bb7fab06a196c46f9547bc4 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Sun, 23 Jun 2024 19:56:29 -0700 Subject: [PATCH 10/28] More feedback. Add test-continue-as-new flag. --- .../atomic_message_handlers_test.py | 7 +- .../atomic_message_handlers/activities.py | 30 +++++-- .../atomic_message_handlers/starter.py | 24 ++++-- .../atomic_message_handlers/workflow.py | 79 +++++++++++-------- 4 files changed, 93 insertions(+), 47 deletions(-) diff --git a/tests/updates_and_signals/atomic_message_handlers_test.py b/tests/updates_and_signals/atomic_message_handlers_test.py index 186b80e7..77017fde 100644 --- a/tests/updates_and_signals/atomic_message_handlers_test.py +++ b/tests/updates_and_signals/atomic_message_handlers_test.py @@ -10,7 +10,10 @@ find_bad_nodes, ) from updates_and_signals.atomic_message_handlers.starter import do_cluster_lifecycle -from updates_and_signals.atomic_message_handlers.workflow import ClusterManagerWorkflow +from updates_and_signals.atomic_message_handlers.workflow import ( + ClusterManagerInput, + ClusterManagerWorkflow, +) async def test_atomic_message_handlers(client: Client): @@ -23,10 +26,10 @@ async def test_atomic_message_handlers(client: Client): ): cluster_manager_handle = await client.start_workflow( ClusterManagerWorkflow.run, + ClusterManagerInput(), id=f"ClusterManagerWorkflow-{uuid.uuid4()}", task_queue=task_queue, start_signal="start_cluster", - args=[None, None], ) await do_cluster_lifecycle(cluster_manager_handle, delay_seconds=1) result = await cluster_manager_handle.result() diff --git a/updates_and_signals/atomic_message_handlers/activities.py b/updates_and_signals/atomic_message_handlers/activities.py index 78f7f66a..e1c115c8 100644 --- a/updates_and_signals/atomic_message_handlers/activities.py +++ b/updates_and_signals/atomic_message_handlers/activities.py @@ -1,25 +1,43 @@ import asyncio +from dataclasses import dataclass from typing import List from temporalio import activity +@dataclass(kw_only=True) +class AllocateNodesToJobInput: + nodes: List[str] + task_name: str + + @activity.defn -async def allocate_nodes_to_job(nodes: List[str], task_name: str) -> List[str]: - print(f"Assigning nodes {nodes} to job {task_name}") +async def allocate_nodes_to_job(input: AllocateNodesToJobInput) -> List[str]: + print(f"Assigning nodes {input.nodes} to job {input.task_name}") await asyncio.sleep(0.1) +@dataclass(kw_only=True) +class DeallocateNodesForJobInput: + nodes: List[str] + task_name: str + + @activity.defn -async def deallocate_nodes_for_job(nodes: List[str], task_name: str) -> List[str]: - print(f"Deallocating nodes {nodes} from job {task_name}") +async def deallocate_nodes_for_job(input: DeallocateNodesForJobInput) -> List[str]: + print(f"Deallocating nodes {input.nodes} from job {input.task_name}") await asyncio.sleep(0.1) +@dataclass(kw_only=True) +class FindBadNodesInput: + nodes_to_check: List[str] + + @activity.defn -async def find_bad_nodes(nodes: List[str]) -> List[str]: +async def find_bad_nodes(input: FindBadNodesInput) -> List[str]: await asyncio.sleep(0.1) - bad_nodes = [n for n in nodes if int(n) % 5 == 0] + bad_nodes = [n for n in input.nodes_to_check if int(n) % 5 == 0] if bad_nodes: print(f"Found bad nodes: {bad_nodes}") else: diff --git a/updates_and_signals/atomic_message_handlers/starter.py b/updates_and_signals/atomic_message_handlers/starter.py index 892fbd0b..77a59e94 100644 --- a/updates_and_signals/atomic_message_handlers/starter.py +++ b/updates_and_signals/atomic_message_handlers/starter.py @@ 
-1,3 +1,4 @@ +import argparse import asyncio import logging import uuid @@ -6,7 +7,10 @@ from temporalio import client, common from temporalio.client import Client, WorkflowHandle -from updates_and_signals.atomic_message_handlers.workflow import ClusterManagerWorkflow +from updates_and_signals.atomic_message_handlers.workflow import ( + ClusterManagerInput, + ClusterManagerWorkflow, +) async def do_cluster_lifecycle(wf: WorkflowHandle, delay_seconds: Optional[int] = None): @@ -32,19 +36,20 @@ async def do_cluster_lifecycle(wf: WorkflowHandle, delay_seconds: Optional[int] await wf.signal(ClusterManagerWorkflow.shutdown_cluster) -async def main(): +async def main(should_test_continue_as_new: bool): # Connect to Temporal client = await Client.connect("localhost:7233") cluster_manager_handle = await client.start_workflow( ClusterManagerWorkflow.run, - args=[None, 150], # max_history_length to conveniently test continue-as-new + ClusterManagerInput(test_continue_as_new=should_test_continue_as_new), id=f"ClusterManagerWorkflow-{uuid.uuid4()}", task_queue="atomic-message-handlers-task-queue", id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, start_signal="start_cluster", ) - await do_cluster_lifecycle(cluster_manager_handle, delay_seconds=1) + delay_seconds = 10 if should_test_continue_as_new else 1 + await do_cluster_lifecycle(cluster_manager_handle, delay_seconds=delay_seconds) result = await cluster_manager_handle.result() print( f"Cluster shut down successfully. It peaked at {result.max_assigned_nodes} assigned nodes ." @@ -54,5 +59,12 @@ async def main(): if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - - asyncio.run(main()) + parser = argparse.ArgumentParser(description="Atomic message handlers") + parser.add_argument( + "--test-continue-as-new", + help="Make the ClusterManagerWorkflow continue as new before shutting down", + action="store_true", + default=False, + ) + args = parser.parse_args() + asyncio.run(main(args.test_continue_as_new)) diff --git a/updates_and_signals/atomic_message_handlers/workflow.py b/updates_and_signals/atomic_message_handlers/workflow.py index 268a6e0d..83e5504d 100644 --- a/updates_and_signals/atomic_message_handlers/workflow.py +++ b/updates_and_signals/atomic_message_handlers/workflow.py @@ -7,9 +7,13 @@ from temporalio import activity, common, workflow from temporalio.client import Client, WorkflowHandle +from temporalio.common import RetryPolicy from temporalio.worker import Worker from updates_and_signals.atomic_message_handlers.activities import ( + AllocateNodesToJobInput, + DeallocateNodesForJobInput, + FindBadNodesInput, allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes, @@ -27,6 +31,12 @@ class ClusterManagerState: num_assigned_nodes: int = 0 +@dataclass(kw_only=True) +class ClusterManagerInput: + state: Optional[ClusterManagerState] = None + test_continue_as_new: bool = False + + @dataclass class ClusterManagerResult: max_assigned_nodes: int @@ -47,7 +57,7 @@ def __init__(self) -> None: @workflow.signal async def start_cluster(self): self.state.cluster_started = True - self.state.nodes = dict([(str(k), None) for k in range(25)]) + self.state.nodes = {str(k): None for k in range(25)} workflow.logger.info("Cluster started") @workflow.signal @@ -63,7 +73,10 @@ async def allocate_n_nodes_to_job( num_nodes: int, ) -> List[str]: await workflow.wait_condition(lambda: self.state.cluster_started) - assert not self.state.cluster_shutdown + if self.state.cluster_shutdown: + raise RuntimeError( + "Cannot 
allocate nodes to a job: Cluster is already shut down" + ) async with self.nodes_lock: unassigned_nodes = [k for k, v in self.state.nodes.items() if v is None] @@ -80,12 +93,10 @@ async def allocate_n_nodes_to_job( ) return assigned_nodes - async def _allocate_nodes_to_job( - self, assigned_nodes: List[str], task_name: str - ) -> List[str]: + async def _allocate_nodes_to_job(self, assigned_nodes: List[str], task_name: str): await workflow.execute_activity( allocate_nodes_to_job, - args=[assigned_nodes, task_name], + AllocateNodesToJobInput(nodes=assigned_nodes, task_name=task_name), start_to_close_timeout=timedelta(seconds=10), ) for node in assigned_nodes: @@ -99,42 +110,45 @@ async def delete_job(self, task_name: str) -> str: nodes_to_free = [k for k, v in self.state.nodes.items() if v == task_name] # This await would be dangerous without nodes_lock because it yields control and allows interleaving. await self._deallocate_nodes_for_job(nodes_to_free, task_name) - return "Done" + return "Done" - async def _deallocate_nodes_for_job( - self, nodes_to_free: List[str], task_name: str - ) -> List[str]: + async def _deallocate_nodes_for_job(self, nodes_to_free: List[str], task_name: str): await workflow.execute_activity( deallocate_nodes_for_job, - args=[nodes_to_free, task_name], + DeallocateNodesForJobInput(nodes=nodes_to_free, task_name=task_name), start_to_close_timeout=timedelta(seconds=10), ) for node in nodes_to_free: self.state.nodes[node] = None + def get_assigned_nodes(self) -> List[str]: + return [k for k, v in self.state.nodes.items() if v is not None and v != "BAD!"] + async def perform_health_checks(self): async with self.nodes_lock: - assigned_nodes = [ - k for k, v in self.state.nodes.items() if v is not None and v != "BAD!" - ] + assigned_nodes = self.get_assigned_nodes() # This await would be dangerous without nodes_lock because it yields control and allows interleaving. bad_nodes = await workflow.execute_activity( find_bad_nodes, - assigned_nodes, + FindBadNodesInput(nodes_to_check=assigned_nodes), start_to_close_timeout=timedelta(seconds=10), + # This health check is optional, and our lock would block the whole workflow if we let it retry forever. + retry_policy=RetryPolicy(maximum_attempts=1), ) for node in bad_nodes: self.state.nodes[node] = "BAD!" - self.state.num_assigned_nodes = len(assigned_nodes) # The cluster manager is a long-running "entity" workflow so we need to periodically checkpoint its state and # continue-as-new. - def init_from_previous_run( - self, state: Optional[ClusterManagerState], max_history_length: Optional[int] - ): - if state: - self.state = state - self.max_history_length = max_history_length + def init(self, input: ClusterManagerInput): + if input.state: + self.state = input.state + if input.test_continue_as_new: + self.max_history_length = 120 + self.sleep_interval_seconds = 1 + else: + self.max_history_length = None + self.sleep_interval_seconds = 600 def should_continue_as_new(self): # We don't want to continue-as-new if we're in the middle of an update @@ -152,34 +166,33 @@ def should_continue_as_new(self): # max_history_size - to more conveniently test continue-as-new, not to be used in production. 
@workflow.run - async def run( - self, state: Optional[ClusterManagerState], max_history_length: Optional[int] - ) -> ClusterManagerResult: - self.init_from_previous_run(state, max_history_length) + async def run(self, input: ClusterManagerInput) -> ClusterManagerResult: + self.init(input) await workflow.wait_condition(lambda: self.state.cluster_started) - # Perform health checks at intervals. (Waking up so frequently is a bad idea in practice because it will rapidly - # increase your workflow history size.) + # Perform health checks at intervals. while True: + await self.perform_health_checks() try: await workflow.wait_condition( lambda: self.state.cluster_shutdown or self.should_continue_as_new(), - timeout=timedelta(seconds=1), + timeout=timedelta(seconds=self.sleep_interval_seconds), ) except asyncio.TimeoutError: pass + self.state.num_assigned_nodes = len(self.get_assigned_nodes()) if self.state.cluster_shutdown: break - await self.perform_health_checks() if self.should_continue_as_new(): workflow.logger.info("Continuing as new") await workflow.continue_as_new( - args=[self.state, self.max_history_length] + ClusterManagerInput( + state=self.state, + test_continue_as_new=input.test_continue_as_new, + ) ) - # Now we can start allocating jobs to nodes - await workflow.wait_condition(lambda: self.state.cluster_shutdown) return ClusterManagerResult( self.state.max_assigned_nodes, self.state.num_assigned_nodes ) From a1506b1dfbbc1307a5ca2b3843b89a00608a651c Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Mon, 24 Jun 2024 10:38:02 -0700 Subject: [PATCH 11/28] Feedback; throw ApplicationFailures from update handlers --- README.md | 1 + .../atomic_message_handlers_test.py | 43 +++++++++++++-- .../atomic_message_handlers/activities.py | 4 +- .../atomic_message_handlers/starter.py | 16 ++++-- .../atomic_message_handlers/workflow.py | 52 +++++++++++++------ 5 files changed, 89 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 5eb48d46..3f2b2ace 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ Some examples require extra dependencies. See each sample's directory for specif * [hello_signal](hello/hello_signal.py) - Send signals to a workflow. * [activity_worker](activity_worker) - Use Python activities from a workflow in another language. +* [atomic_message_handlers](updates_and_signals/atomic_message_handlers/) - Safely handling updates and signals. * [bedrock](bedrock) - Orchestrate a chatbot with Amazon Bedrock. * [cloud_export_to_parquet](cloud_export_to_parquet) - Set up schedule workflow to process exported files on an hourly basis * [context_propagation](context_propagation) - Context propagation through workflows/activities via interceptor. 
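
The commit above ("throw ApplicationFailures from update handlers") is about which exceptions reach the update caller. A rough sketch of both sides of that contract, with illustrative names; the comments restate the rationale the patch itself adds to the handlers further down:

    from temporalio import workflow
    from temporalio.client import WorkflowHandle, WorkflowUpdateFailedError
    from temporalio.exceptions import ApplicationError


    @workflow.defn
    class FailingUpdateExample:
        @workflow.update
        async def do_thing(self, n: int) -> None:
            if n > 10:
                # ApplicationError fails only this update and is surfaced to the
                # caller. A plain exception would instead fail the workflow task,
                # which the server retries, leaving the update (and workflow) stuck.
                raise ApplicationError(f"{n} is too many")


    async def call_update(handle: WorkflowHandle) -> None:
        try:
            await handle.execute_update(FailingUpdateExample.do_thing, 11)
        except WorkflowUpdateFailedError as err:
            print("update failed:", err.cause)
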
diff --git a/tests/updates_and_signals/atomic_message_handlers_test.py b/tests/updates_and_signals/atomic_message_handlers_test.py index 77017fde..6268c5ad 100644 --- a/tests/updates_and_signals/atomic_message_handlers_test.py +++ b/tests/updates_and_signals/atomic_message_handlers_test.py @@ -1,7 +1,7 @@ import uuid from temporalio import common, workflow -from temporalio.client import Client +from temporalio.client import Client, WorkflowUpdateFailedError from temporalio.worker import Worker from updates_and_signals.atomic_message_handlers.activities import ( @@ -11,6 +11,7 @@ ) from updates_and_signals.atomic_message_handlers.starter import do_cluster_lifecycle from updates_and_signals.atomic_message_handlers.workflow import ( + ClusterManagerAllocateNNodesToJobInput, ClusterManagerInput, ClusterManagerWorkflow, ) @@ -28,10 +29,44 @@ async def test_atomic_message_handlers(client: Client): ClusterManagerWorkflow.run, ClusterManagerInput(), id=f"ClusterManagerWorkflow-{uuid.uuid4()}", - task_queue=task_queue, - start_signal="start_cluster", + task_queue=task_queue ) await do_cluster_lifecycle(cluster_manager_handle, delay_seconds=1) result = await cluster_manager_handle.result() assert result.max_assigned_nodes == 12 - assert result.num_assigned_nodes == 0 + assert result.num_currently_assigned_nodes == 0 + +async def test_update_failure(client: Client): + task_queue = f"tq-{uuid.uuid4()}" + async with Worker( + client, + task_queue=task_queue, + workflows=[ClusterManagerWorkflow], + activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], + ): + cluster_manager_handle = await client.start_workflow( + ClusterManagerWorkflow.run, + ClusterManagerInput(), + id=f"ClusterManagerWorkflow-{uuid.uuid4()}", + task_queue=task_queue, + ) + + await cluster_manager_handle.signal(ClusterManagerWorkflow.start_cluster) + + await cluster_manager_handle.execute_update( + ClusterManagerWorkflow.allocate_n_nodes_to_job, + ClusterManagerAllocateNNodesToJobInput(num_nodes=24, task_name=f"big-task") + ) + try: + # Try to allocate too many nodes + await cluster_manager_handle.execute_update( + ClusterManagerWorkflow.allocate_n_nodes_to_job, + ClusterManagerAllocateNNodesToJobInput(num_nodes=3, task_name=f"little-task") + ) + except WorkflowUpdateFailedError as e: + assert e.cause.message == "Cannot allocate 3 nodes; have only 1 available" + finally: + await cluster_manager_handle.signal(ClusterManagerWorkflow.shutdown_cluster) + result = await cluster_manager_handle.result() + assert result.num_currently_assigned_nodes == 24 + diff --git a/updates_and_signals/atomic_message_handlers/activities.py b/updates_and_signals/atomic_message_handlers/activities.py index e1c115c8..a1648fee 100644 --- a/updates_and_signals/atomic_message_handlers/activities.py +++ b/updates_and_signals/atomic_message_handlers/activities.py @@ -12,7 +12,7 @@ class AllocateNodesToJobInput: @activity.defn -async def allocate_nodes_to_job(input: AllocateNodesToJobInput) -> List[str]: +async def allocate_nodes_to_job(input: AllocateNodesToJobInput): print(f"Assigning nodes {input.nodes} to job {input.task_name}") await asyncio.sleep(0.1) @@ -24,7 +24,7 @@ class DeallocateNodesForJobInput: @activity.defn -async def deallocate_nodes_for_job(input: DeallocateNodesForJobInput) -> List[str]: +async def deallocate_nodes_for_job(input: DeallocateNodesForJobInput): print(f"Deallocating nodes {input.nodes} from job {input.task_name}") await asyncio.sleep(0.1) diff --git a/updates_and_signals/atomic_message_handlers/starter.py 
b/updates_and_signals/atomic_message_handlers/starter.py index 77a59e94..c88c8d96 100644 --- a/updates_and_signals/atomic_message_handlers/starter.py +++ b/updates_and_signals/atomic_message_handlers/starter.py @@ -8,17 +8,23 @@ from temporalio.client import Client, WorkflowHandle from updates_and_signals.atomic_message_handlers.workflow import ( + ClusterManagerAllocateNNodesToJobInput, + ClusterManagerDeleteJobInput, ClusterManagerInput, ClusterManagerWorkflow, ) async def do_cluster_lifecycle(wf: WorkflowHandle, delay_seconds: Optional[int] = None): + + await wf.signal(ClusterManagerWorkflow.start_cluster) + allocation_updates = [] for i in range(6): allocation_updates.append( wf.execute_update( - ClusterManagerWorkflow.allocate_n_nodes_to_job, args=[f"task-{i}", 2] + ClusterManagerWorkflow.allocate_n_nodes_to_job, + ClusterManagerAllocateNNodesToJobInput(num_nodes=2, task_name=f"task-{i}") ) ) await asyncio.gather(*allocation_updates) @@ -29,7 +35,10 @@ async def do_cluster_lifecycle(wf: WorkflowHandle, delay_seconds: Optional[int] deletion_updates = [] for i in range(6): deletion_updates.append( - wf.execute_update(ClusterManagerWorkflow.delete_job, f"task-{i}") + wf.execute_update( + ClusterManagerWorkflow.delete_job, + ClusterManagerDeleteJobInput(task_name=f"task-{i}") + ) ) await asyncio.gather(*deletion_updates) @@ -46,14 +55,13 @@ async def main(should_test_continue_as_new: bool): id=f"ClusterManagerWorkflow-{uuid.uuid4()}", task_queue="atomic-message-handlers-task-queue", id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, - start_signal="start_cluster", ) delay_seconds = 10 if should_test_continue_as_new else 1 await do_cluster_lifecycle(cluster_manager_handle, delay_seconds=delay_seconds) result = await cluster_manager_handle.result() print( f"Cluster shut down successfully. It peaked at {result.max_assigned_nodes} assigned nodes ." - f" It had {result.num_assigned_nodes} nodes assigned at the end." + f" It had {result.num_currently_assigned_nodes} nodes assigned at the end." ) diff --git a/updates_and_signals/atomic_message_handlers/workflow.py b/updates_and_signals/atomic_message_handlers/workflow.py index 83e5504d..b8c736d4 100644 --- a/updates_and_signals/atomic_message_handlers/workflow.py +++ b/updates_and_signals/atomic_message_handlers/workflow.py @@ -8,6 +8,7 @@ from temporalio import activity, common, workflow from temporalio.client import Client, WorkflowHandle from temporalio.common import RetryPolicy +from temporalio.exceptions import ApplicationError from temporalio.worker import Worker from updates_and_signals.atomic_message_handlers.activities import ( @@ -28,7 +29,6 @@ class ClusterManagerState: cluster_shutdown: bool = False nodes: Optional[Dict[str, Optional[str]]] = None max_assigned_nodes: int = 0 - num_assigned_nodes: int = 0 @dataclass(kw_only=True) @@ -40,8 +40,16 @@ class ClusterManagerInput: @dataclass class ClusterManagerResult: max_assigned_nodes: int - num_assigned_nodes: int + num_currently_assigned_nodes: int +@dataclass(kw_only=True) +class ClusterManagerAllocateNNodesToJobInput: + num_nodes: int + task_name: str + +@dataclass(kw_only=True) +class ClusterManagerDeleteJobInput: + task_name: str # ClusterManagerWorkflow keeps track of the allocations of a cluster of nodes. # Via signals, the cluster can be started and shutdown. 
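
The comments added in the hunks that follow point out two ways to surface a failure to the update caller: raise ApplicationError from the handler body (the route this sample takes) or reject the update from a validator. For reference, a sketch of the validator route, assuming the SDK's @<update>.validator decorator; names are illustrative:

    from temporalio import workflow


    @workflow.defn
    class ValidatedAllocator:
        @workflow.update
        async def allocate(self, num_nodes: int) -> None:
            # Runs only if the validator below accepted the update.
            workflow.logger.info("allocating %d nodes", num_nodes)

        @allocate.validator
        def validate_allocate(self, num_nodes: int) -> None:
            # Validators are synchronous and run before the handler; raising here
            # rejects the update before it is accepted into workflow history.
            if num_nodes <= 0:
                raise ValueError("num_nodes must be a positive integer")
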
@@ -69,24 +77,29 @@ async def shutdown_cluster(self): @workflow.update async def allocate_n_nodes_to_job( self, - task_name: str, - num_nodes: int, + input: ClusterManagerAllocateNNodesToJobInput ) -> List[str]: await workflow.wait_condition(lambda: self.state.cluster_started) if self.state.cluster_shutdown: - raise RuntimeError( + # If you want the client to receive a failure, either add an update validator and throw the + # exception from there, or raise an ApplicationError. Other exceptions in the main handler + # will cause the workflow to keep retrying and get it stuck. + raise ApplicationError( "Cannot allocate nodes to a job: Cluster is already shut down" ) async with self.nodes_lock: unassigned_nodes = [k for k, v in self.state.nodes.items() if v is None] - if len(unassigned_nodes) < num_nodes: - raise ValueError( - f"Cannot allocate {num_nodes} nodes; have only {len(unassigned_nodes)} available" + if len(unassigned_nodes) < input.num_nodes: + # If you want the client to receive a failure, either add an update validator and throw the + # exception from there, or raise an ApplicationError. Other exceptions in the main handler + # will cause the workflow to keep retrying and get it stuck. + raise ApplicationError( + f"Cannot allocate {input.num_nodes} nodes; have only {len(unassigned_nodes)} available" ) - assigned_nodes = unassigned_nodes[:num_nodes] + assigned_nodes = unassigned_nodes[:input.num_nodes] # This await would be dangerous without nodes_lock because it yields control and allows interleaving. - await self._allocate_nodes_to_job(assigned_nodes, task_name) + await self._allocate_nodes_to_job(assigned_nodes, input.task_name) self.state.max_assigned_nodes = max( self.state.max_assigned_nodes, len([k for k, v in self.state.nodes.items() if v is not None]), @@ -103,14 +116,20 @@ async def _allocate_nodes_to_job(self, assigned_nodes: List[str], task_name: str self.state.nodes[node] = task_name @workflow.update - async def delete_job(self, task_name: str) -> str: + async def delete_job(self, input: ClusterManagerDeleteJobInput): await workflow.wait_condition(lambda: self.state.cluster_started) - assert not self.state.cluster_shutdown + if self.state.cluster_shutdown: + # If you want the client to receive a failure, either add an update validator and throw the + # exception from there, or raise an ApplicationError. Other exceptions in the main handler + # will cause the workflow to keep retrying and get it stuck. + raise ApplicationError( + "Cannot delete a job: Cluster is already shut down" + ) + async with self.nodes_lock: - nodes_to_free = [k for k, v in self.state.nodes.items() if v == task_name] + nodes_to_free = [k for k, v in self.state.nodes.items() if v == input.task_name] # This await would be dangerous without nodes_lock because it yields control and allows interleaving. 
- await self._deallocate_nodes_for_job(nodes_to_free, task_name) - return "Done" + await self._deallocate_nodes_for_job(nodes_to_free, input.task_name) async def _deallocate_nodes_for_job(self, nodes_to_free: List[str], task_name: str): await workflow.execute_activity( @@ -181,7 +200,6 @@ async def run(self, input: ClusterManagerInput) -> ClusterManagerResult: ) except asyncio.TimeoutError: pass - self.state.num_assigned_nodes = len(self.get_assigned_nodes()) if self.state.cluster_shutdown: break if self.should_continue_as_new(): @@ -194,5 +212,5 @@ async def run(self, input: ClusterManagerInput) -> ClusterManagerResult: ) return ClusterManagerResult( - self.state.max_assigned_nodes, self.state.num_assigned_nodes + self.state.max_assigned_nodes, len(self.get_assigned_nodes()) ) From 2cad3dd9bf33a8b3dc00eeddfd064fcedb4a8fe1 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Mon, 24 Jun 2024 10:38:45 -0700 Subject: [PATCH 12/28] Formatting --- .../atomic_message_handlers_test.py | 14 ++++++++------ .../atomic_message_handlers/starter.py | 12 +++++++----- .../atomic_message_handlers/workflow.py | 16 +++++++++------- 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/tests/updates_and_signals/atomic_message_handlers_test.py b/tests/updates_and_signals/atomic_message_handlers_test.py index 6268c5ad..d13b0c7e 100644 --- a/tests/updates_and_signals/atomic_message_handlers_test.py +++ b/tests/updates_and_signals/atomic_message_handlers_test.py @@ -29,13 +29,14 @@ async def test_atomic_message_handlers(client: Client): ClusterManagerWorkflow.run, ClusterManagerInput(), id=f"ClusterManagerWorkflow-{uuid.uuid4()}", - task_queue=task_queue + task_queue=task_queue, ) await do_cluster_lifecycle(cluster_manager_handle, delay_seconds=1) result = await cluster_manager_handle.result() assert result.max_assigned_nodes == 12 assert result.num_currently_assigned_nodes == 0 + async def test_update_failure(client: Client): task_queue = f"tq-{uuid.uuid4()}" async with Worker( @@ -54,14 +55,16 @@ async def test_update_failure(client: Client): await cluster_manager_handle.signal(ClusterManagerWorkflow.start_cluster) await cluster_manager_handle.execute_update( - ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput(num_nodes=24, task_name=f"big-task") + ClusterManagerWorkflow.allocate_n_nodes_to_job, + ClusterManagerAllocateNNodesToJobInput(num_nodes=24, task_name=f"big-task"), ) try: # Try to allocate too many nodes await cluster_manager_handle.execute_update( - ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput(num_nodes=3, task_name=f"little-task") + ClusterManagerWorkflow.allocate_n_nodes_to_job, + ClusterManagerAllocateNNodesToJobInput( + num_nodes=3, task_name=f"little-task" + ), ) except WorkflowUpdateFailedError as e: assert e.cause.message == "Cannot allocate 3 nodes; have only 1 available" @@ -69,4 +72,3 @@ async def test_update_failure(client: Client): await cluster_manager_handle.signal(ClusterManagerWorkflow.shutdown_cluster) result = await cluster_manager_handle.result() assert result.num_currently_assigned_nodes == 24 - diff --git a/updates_and_signals/atomic_message_handlers/starter.py b/updates_and_signals/atomic_message_handlers/starter.py index c88c8d96..6097012f 100644 --- a/updates_and_signals/atomic_message_handlers/starter.py +++ b/updates_and_signals/atomic_message_handlers/starter.py @@ -16,15 +16,17 @@ async def do_cluster_lifecycle(wf: WorkflowHandle, delay_seconds: Optional[int] = None): - + await 
wf.signal(ClusterManagerWorkflow.start_cluster) allocation_updates = [] for i in range(6): allocation_updates.append( wf.execute_update( - ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput(num_nodes=2, task_name=f"task-{i}") + ClusterManagerWorkflow.allocate_n_nodes_to_job, + ClusterManagerAllocateNNodesToJobInput( + num_nodes=2, task_name=f"task-{i}" + ), ) ) await asyncio.gather(*allocation_updates) @@ -36,8 +38,8 @@ async def do_cluster_lifecycle(wf: WorkflowHandle, delay_seconds: Optional[int] for i in range(6): deletion_updates.append( wf.execute_update( - ClusterManagerWorkflow.delete_job, - ClusterManagerDeleteJobInput(task_name=f"task-{i}") + ClusterManagerWorkflow.delete_job, + ClusterManagerDeleteJobInput(task_name=f"task-{i}"), ) ) await asyncio.gather(*deletion_updates) diff --git a/updates_and_signals/atomic_message_handlers/workflow.py b/updates_and_signals/atomic_message_handlers/workflow.py index b8c736d4..54829abf 100644 --- a/updates_and_signals/atomic_message_handlers/workflow.py +++ b/updates_and_signals/atomic_message_handlers/workflow.py @@ -42,15 +42,18 @@ class ClusterManagerResult: max_assigned_nodes: int num_currently_assigned_nodes: int + @dataclass(kw_only=True) class ClusterManagerAllocateNNodesToJobInput: num_nodes: int task_name: str + @dataclass(kw_only=True) class ClusterManagerDeleteJobInput: task_name: str + # ClusterManagerWorkflow keeps track of the allocations of a cluster of nodes. # Via signals, the cluster can be started and shutdown. # Via updates, clients can also assign jobs to nodes and delete jobs. @@ -76,8 +79,7 @@ async def shutdown_cluster(self): @workflow.update async def allocate_n_nodes_to_job( - self, - input: ClusterManagerAllocateNNodesToJobInput + self, input: ClusterManagerAllocateNNodesToJobInput ) -> List[str]: await workflow.wait_condition(lambda: self.state.cluster_started) if self.state.cluster_shutdown: @@ -97,7 +99,7 @@ async def allocate_n_nodes_to_job( raise ApplicationError( f"Cannot allocate {input.num_nodes} nodes; have only {len(unassigned_nodes)} available" ) - assigned_nodes = unassigned_nodes[:input.num_nodes] + assigned_nodes = unassigned_nodes[: input.num_nodes] # This await would be dangerous without nodes_lock because it yields control and allows interleaving. await self._allocate_nodes_to_job(assigned_nodes, input.task_name) self.state.max_assigned_nodes = max( @@ -122,12 +124,12 @@ async def delete_job(self, input: ClusterManagerDeleteJobInput): # If you want the client to receive a failure, either add an update validator and throw the # exception from there, or raise an ApplicationError. Other exceptions in the main handler # will cause the workflow to keep retrying and get it stuck. - raise ApplicationError( - "Cannot delete a job: Cluster is already shut down" - ) + raise ApplicationError("Cannot delete a job: Cluster is already shut down") async with self.nodes_lock: - nodes_to_free = [k for k, v in self.state.nodes.items() if v == input.task_name] + nodes_to_free = [ + k for k, v in self.state.nodes.items() if v == input.task_name + ] # This await would be dangerous without nodes_lock because it yields control and allows interleaving. 
await self._deallocate_nodes_for_job(nodes_to_free, input.task_name) From d5db7d7b1f9d0a2c3f137fd561738b0034e759ab Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Mon, 24 Jun 2024 13:14:29 -0700 Subject: [PATCH 13/28] __init__.py --- updates_and_signals/atomic_message_handlers/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 updates_and_signals/atomic_message_handlers/__init__.py diff --git a/updates_and_signals/atomic_message_handlers/__init__.py b/updates_and_signals/atomic_message_handlers/__init__.py new file mode 100644 index 00000000..e69de29b From f39841ccbee7e03a7fe05508c3f44659467b6ecb Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Mon, 24 Jun 2024 14:57:56 -0700 Subject: [PATCH 14/28] Fix lint issues --- .../atomic_message_handlers_test.py | 2 ++ updates_and_signals/__init__.py | 0 .../atomic_message_handlers/workflow.py | 23 +++++++++++-------- 3 files changed, 15 insertions(+), 10 deletions(-) create mode 100644 updates_and_signals/__init__.py diff --git a/tests/updates_and_signals/atomic_message_handlers_test.py b/tests/updates_and_signals/atomic_message_handlers_test.py index d13b0c7e..1e52f3f7 100644 --- a/tests/updates_and_signals/atomic_message_handlers_test.py +++ b/tests/updates_and_signals/atomic_message_handlers_test.py @@ -2,6 +2,7 @@ from temporalio import common, workflow from temporalio.client import Client, WorkflowUpdateFailedError +from temporalio.exceptions import ApplicationError from temporalio.worker import Worker from updates_and_signals.atomic_message_handlers.activities import ( @@ -67,6 +68,7 @@ async def test_update_failure(client: Client): ), ) except WorkflowUpdateFailedError as e: + assert isinstance(e.cause, ApplicationError) assert e.cause.message == "Cannot allocate 3 nodes; have only 1 available" finally: await cluster_manager_handle.signal(ClusterManagerWorkflow.shutdown_cluster) diff --git a/updates_and_signals/__init__.py b/updates_and_signals/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/updates_and_signals/atomic_message_handlers/workflow.py b/updates_and_signals/atomic_message_handlers/workflow.py index 54829abf..61125bd1 100644 --- a/updates_and_signals/atomic_message_handlers/workflow.py +++ b/updates_and_signals/atomic_message_handlers/workflow.py @@ -1,4 +1,5 @@ import asyncio +import dataclasses import logging import uuid from dataclasses import dataclass @@ -27,7 +28,7 @@ class ClusterManagerState: cluster_started: bool = False cluster_shutdown: bool = False - nodes: Optional[Dict[str, Optional[str]]] = None + nodes: Dict[str, Optional[str]] = dataclasses.field(default_factory=dict) max_assigned_nodes: int = 0 @@ -64,6 +65,8 @@ def __init__(self) -> None: self.state = ClusterManagerState() # Protects workflow state from interleaved access self.nodes_lock = asyncio.Lock() + self.max_history_length: Optional[int] = None + self.sleep_interval_seconds: int = 600 @workflow.signal async def start_cluster(self): @@ -91,7 +94,7 @@ async def allocate_n_nodes_to_job( ) async with self.nodes_lock: - unassigned_nodes = [k for k, v in self.state.nodes.items() if v is None] + unassigned_nodes = self.get_unassigned_nodes() if len(unassigned_nodes) < input.num_nodes: # If you want the client to receive a failure, either add an update validator and throw the # exception from there, or raise an ApplicationError. 
Other exceptions in the main handler @@ -99,14 +102,14 @@ async def allocate_n_nodes_to_job( raise ApplicationError( f"Cannot allocate {input.num_nodes} nodes; have only {len(unassigned_nodes)} available" ) - assigned_nodes = unassigned_nodes[: input.num_nodes] + nodes_to_assign = unassigned_nodes[: input.num_nodes] # This await would be dangerous without nodes_lock because it yields control and allows interleaving. - await self._allocate_nodes_to_job(assigned_nodes, input.task_name) + await self._allocate_nodes_to_job(nodes_to_assign, input.task_name) self.state.max_assigned_nodes = max( self.state.max_assigned_nodes, - len([k for k, v in self.state.nodes.items() if v is not None]), + len(self.get_assigned_nodes()), ) - return assigned_nodes + return nodes_to_assign async def _allocate_nodes_to_job(self, assigned_nodes: List[str], task_name: str): await workflow.execute_activity( @@ -142,6 +145,9 @@ async def _deallocate_nodes_for_job(self, nodes_to_free: List[str], task_name: s for node in nodes_to_free: self.state.nodes[node] = None + def get_unassigned_nodes(self) -> List[str]: + return [k for k, v in self.state.nodes.items() if v is None] + def get_assigned_nodes(self) -> List[str]: return [k for k, v in self.state.nodes.items() if v is not None and v != "BAD!"] @@ -167,9 +173,6 @@ def init(self, input: ClusterManagerInput): if input.test_continue_as_new: self.max_history_length = 120 self.sleep_interval_seconds = 1 - else: - self.max_history_length = None - self.sleep_interval_seconds = 600 def should_continue_as_new(self): # We don't want to continue-as-new if we're in the middle of an update @@ -206,7 +209,7 @@ async def run(self, input: ClusterManagerInput) -> ClusterManagerResult: break if self.should_continue_as_new(): workflow.logger.info("Continuing as new") - await workflow.continue_as_new( + workflow.continue_as_new( ClusterManagerInput( state=self.state, test_continue_as_new=input.test_continue_as_new, From 344d6945a1b6bacb414a5fb4ec9167d4c02b7544 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Tue, 25 Jun 2024 09:28:56 -0700 Subject: [PATCH 15/28] Dan Feedback --- .../atomic_message_handlers_test.py | 4 +- .../atomic_message_handlers/activities.py | 8 ++-- .../atomic_message_handlers/starter.py | 4 +- .../atomic_message_handlers/workflow.py | 39 +++++++++++-------- 4 files changed, 31 insertions(+), 24 deletions(-) diff --git a/tests/updates_and_signals/atomic_message_handlers_test.py b/tests/updates_and_signals/atomic_message_handlers_test.py index 1e52f3f7..550b2480 100644 --- a/tests/updates_and_signals/atomic_message_handlers_test.py +++ b/tests/updates_and_signals/atomic_message_handlers_test.py @@ -57,14 +57,14 @@ async def test_update_failure(client: Client): await cluster_manager_handle.execute_update( ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput(num_nodes=24, task_name=f"big-task"), + ClusterManagerAllocateNNodesToJobInput(num_nodes=24, job_name=f"big-task"), ) try: # Try to allocate too many nodes await cluster_manager_handle.execute_update( ClusterManagerWorkflow.allocate_n_nodes_to_job, ClusterManagerAllocateNNodesToJobInput( - num_nodes=3, task_name=f"little-task" + num_nodes=3, job_name=f"little-task" ), ) except WorkflowUpdateFailedError as e: diff --git a/updates_and_signals/atomic_message_handlers/activities.py b/updates_and_signals/atomic_message_handlers/activities.py index a1648fee..be83b8e3 100644 --- a/updates_and_signals/atomic_message_handlers/activities.py +++ 
b/updates_and_signals/atomic_message_handlers/activities.py @@ -8,24 +8,24 @@ @dataclass(kw_only=True) class AllocateNodesToJobInput: nodes: List[str] - task_name: str + job_name: str @activity.defn async def allocate_nodes_to_job(input: AllocateNodesToJobInput): - print(f"Assigning nodes {input.nodes} to job {input.task_name}") + print(f"Assigning nodes {input.nodes} to job {input.job_name}") await asyncio.sleep(0.1) @dataclass(kw_only=True) class DeallocateNodesForJobInput: nodes: List[str] - task_name: str + job_name: str @activity.defn async def deallocate_nodes_for_job(input: DeallocateNodesForJobInput): - print(f"Deallocating nodes {input.nodes} from job {input.task_name}") + print(f"Deallocating nodes {input.nodes} from job {input.job_name}") await asyncio.sleep(0.1) diff --git a/updates_and_signals/atomic_message_handlers/starter.py b/updates_and_signals/atomic_message_handlers/starter.py index 6097012f..858dff25 100644 --- a/updates_and_signals/atomic_message_handlers/starter.py +++ b/updates_and_signals/atomic_message_handlers/starter.py @@ -25,7 +25,7 @@ async def do_cluster_lifecycle(wf: WorkflowHandle, delay_seconds: Optional[int] wf.execute_update( ClusterManagerWorkflow.allocate_n_nodes_to_job, ClusterManagerAllocateNNodesToJobInput( - num_nodes=2, task_name=f"task-{i}" + num_nodes=2, job_name=f"task-{i}" ), ) ) @@ -39,7 +39,7 @@ async def do_cluster_lifecycle(wf: WorkflowHandle, delay_seconds: Optional[int] deletion_updates.append( wf.execute_update( ClusterManagerWorkflow.delete_job, - ClusterManagerDeleteJobInput(task_name=f"task-{i}"), + ClusterManagerDeleteJobInput(job_name=f"task-{i}"), ) ) await asyncio.gather(*deletion_updates) diff --git a/updates_and_signals/atomic_message_handlers/workflow.py b/updates_and_signals/atomic_message_handlers/workflow.py index 61125bd1..a37c58fd 100644 --- a/updates_and_signals/atomic_message_handlers/workflow.py +++ b/updates_and_signals/atomic_message_handlers/workflow.py @@ -47,12 +47,12 @@ class ClusterManagerResult: @dataclass(kw_only=True) class ClusterManagerAllocateNNodesToJobInput: num_nodes: int - task_name: str + job_name: str @dataclass(kw_only=True) class ClusterManagerDeleteJobInput: - task_name: str + job_name: str # ClusterManagerWorkflow keeps track of the allocations of a cluster of nodes. @@ -80,6 +80,9 @@ async def shutdown_cluster(self): self.state.cluster_shutdown = True workflow.logger.info("Cluster shut down") + # This is an update as opposed to a signal because the client may want to wait for nodes to be allocated + # before sending work to those nodes. + # Returns the list of node names that were allocated to the job. @workflow.update async def allocate_n_nodes_to_job( self, input: ClusterManagerAllocateNNodesToJobInput @@ -103,25 +106,28 @@ async def allocate_n_nodes_to_job( f"Cannot allocate {input.num_nodes} nodes; have only {len(unassigned_nodes)} available" ) nodes_to_assign = unassigned_nodes[: input.num_nodes] - # This await would be dangerous without nodes_lock because it yields control and allows interleaving. - await self._allocate_nodes_to_job(nodes_to_assign, input.task_name) + # This await would be dangerous without nodes_lock because it yields control and allows interleaving + # with delete_job and perform_health_checks, which both touch self.state.nodes. 
+ await self._allocate_nodes_to_job(nodes_to_assign, input.job_name) self.state.max_assigned_nodes = max( self.state.max_assigned_nodes, len(self.get_assigned_nodes()), ) return nodes_to_assign - async def _allocate_nodes_to_job(self, assigned_nodes: List[str], task_name: str): + async def _allocate_nodes_to_job(self, assigned_nodes: List[str], job_name: str): await workflow.execute_activity( allocate_nodes_to_job, - AllocateNodesToJobInput(nodes=assigned_nodes, task_name=task_name), + AllocateNodesToJobInput(nodes=assigned_nodes, job_name=job_name), start_to_close_timeout=timedelta(seconds=10), ) for node in assigned_nodes: - self.state.nodes[node] = task_name + self.state.nodes[node] = job_name + # Even though it returns nothing, this is an update because the client may want track it, for example + # to wait for nodes to be deallocated before reassigning them. @workflow.update - async def delete_job(self, input: ClusterManagerDeleteJobInput): + async def delete_job(self, input: ClusterManagerDeleteJobInput) -> None: await workflow.wait_condition(lambda: self.state.cluster_started) if self.state.cluster_shutdown: # If you want the client to receive a failure, either add an update validator and throw the @@ -131,15 +137,16 @@ async def delete_job(self, input: ClusterManagerDeleteJobInput): async with self.nodes_lock: nodes_to_free = [ - k for k, v in self.state.nodes.items() if v == input.task_name + k for k, v in self.state.nodes.items() if v == input.job_name ] - # This await would be dangerous without nodes_lock because it yields control and allows interleaving. - await self._deallocate_nodes_for_job(nodes_to_free, input.task_name) + # This await would be dangerous without nodes_lock because it yields control and allows interleaving + # with allocate_n_nodes_to_job and perform_health_checks, which all touch self.state.nodes. + await self._deallocate_nodes_for_job(nodes_to_free, input.job_name) - async def _deallocate_nodes_for_job(self, nodes_to_free: List[str], task_name: str): + async def _deallocate_nodes_for_job(self, nodes_to_free: List[str], job_name: str): await workflow.execute_activity( deallocate_nodes_for_job, - DeallocateNodesForJobInput(nodes=nodes_to_free, task_name=task_name), + DeallocateNodesForJobInput(nodes=nodes_to_free, job_name=job_name), start_to_close_timeout=timedelta(seconds=10), ) for node in nodes_to_free: @@ -154,7 +161,8 @@ def get_assigned_nodes(self) -> List[str]: async def perform_health_checks(self): async with self.nodes_lock: assigned_nodes = self.get_assigned_nodes() - # This await would be dangerous without nodes_lock because it yields control and allows interleaving. + # This await would be dangerous without nodes_lock because it yields control and allows interleaving + # with allocate_n_nodes_to_job and delete_job, which both touch self.state.nodes. bad_nodes = await workflow.execute_activity( find_bad_nodes, FindBadNodesInput(nodes_to_check=assigned_nodes), @@ -174,7 +182,7 @@ def init(self, input: ClusterManagerInput): self.max_history_length = 120 self.sleep_interval_seconds = 1 - def should_continue_as_new(self): + def should_continue_as_new(self) -> bool: # We don't want to continue-as-new if we're in the middle of an update if self.nodes_lock.locked(): return False @@ -188,7 +196,6 @@ def should_continue_as_new(self): return True return False - # max_history_size - to more conveniently test continue-as-new, not to be used in production. 
@workflow.run async def run(self, input: ClusterManagerInput) -> ClusterManagerResult: self.init(input) From fc74a695f48684cd463e55a15c46fd1e73a93cb6 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Tue, 25 Jun 2024 09:47:13 -0700 Subject: [PATCH 16/28] More typehints --- .../atomic_message_handlers/workflow.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/updates_and_signals/atomic_message_handlers/workflow.py b/updates_and_signals/atomic_message_handlers/workflow.py index a37c58fd..9e3ac5cb 100644 --- a/updates_and_signals/atomic_message_handlers/workflow.py +++ b/updates_and_signals/atomic_message_handlers/workflow.py @@ -69,13 +69,13 @@ def __init__(self) -> None: self.sleep_interval_seconds: int = 600 @workflow.signal - async def start_cluster(self): + async def start_cluster(self) -> None: self.state.cluster_started = True self.state.nodes = {str(k): None for k in range(25)} workflow.logger.info("Cluster started") @workflow.signal - async def shutdown_cluster(self): + async def shutdown_cluster(self) -> None: await workflow.wait_condition(lambda: self.state.cluster_started) self.state.cluster_shutdown = True workflow.logger.info("Cluster shut down") @@ -115,7 +115,9 @@ async def allocate_n_nodes_to_job( ) return nodes_to_assign - async def _allocate_nodes_to_job(self, assigned_nodes: List[str], job_name: str): + async def _allocate_nodes_to_job( + self, assigned_nodes: List[str], job_name: str + ) -> None: await workflow.execute_activity( allocate_nodes_to_job, AllocateNodesToJobInput(nodes=assigned_nodes, job_name=job_name), @@ -158,7 +160,7 @@ def get_unassigned_nodes(self) -> List[str]: def get_assigned_nodes(self) -> List[str]: return [k for k, v in self.state.nodes.items() if v is not None and v != "BAD!"] - async def perform_health_checks(self): + async def perform_health_checks(self) -> None: async with self.nodes_lock: assigned_nodes = self.get_assigned_nodes() # This await would be dangerous without nodes_lock because it yields control and allows interleaving @@ -175,7 +177,7 @@ async def perform_health_checks(self): # The cluster manager is a long-running "entity" workflow so we need to periodically checkpoint its state and # continue-as-new. 
- def init(self, input: ClusterManagerInput): + def init(self, input: ClusterManagerInput) -> None: if input.state: self.state = input.state if input.test_continue_as_new: From 0b84c255b99ae4b03caeebf4acf38bac17ae20b1 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Tue, 25 Jun 2024 11:29:59 -0700 Subject: [PATCH 17/28] s/atomic/safe/ --- README.md | 2 +- ...age_handlers_test.py => safe_message_handlers_test.py} | 8 ++++---- .../README.md | 2 +- .../__init__.py | 0 .../activities.py | 0 .../starter.py | 2 +- .../worker.py | 2 +- .../workflow.py | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) rename tests/updates_and_signals/{atomic_message_handlers_test.py => safe_message_handlers_test.py} (90%) rename updates_and_signals/{atomic_message_handlers => safe_message_handlers}/README.md (93%) rename updates_and_signals/{atomic_message_handlers => safe_message_handlers}/__init__.py (100%) rename updates_and_signals/{atomic_message_handlers => safe_message_handlers}/activities.py (100%) rename updates_and_signals/{atomic_message_handlers => safe_message_handlers}/starter.py (97%) rename updates_and_signals/{atomic_message_handlers => safe_message_handlers}/worker.py (94%) rename updates_and_signals/{atomic_message_handlers => safe_message_handlers}/workflow.py (99%) diff --git a/README.md b/README.md index 3f2b2ace..3fdcc157 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ Some examples require extra dependencies. See each sample's directory for specif * [hello_signal](hello/hello_signal.py) - Send signals to a workflow. * [activity_worker](activity_worker) - Use Python activities from a workflow in another language. -* [atomic_message_handlers](updates_and_signals/atomic_message_handlers/) - Safely handling updates and signals. +* [safe_message_handlers](updates_and_signals/safe_message_handlers/) - Safely handling updates and signals. * [bedrock](bedrock) - Orchestrate a chatbot with Amazon Bedrock. * [cloud_export_to_parquet](cloud_export_to_parquet) - Set up schedule workflow to process exported files on an hourly basis * [context_propagation](context_propagation) - Context propagation through workflows/activities via interceptor. 
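
Taken together, the last few patches settle the shape of the long-running loop: wake on a timer, run health checks, and only continue-as-new once no handler holds the lock and the server suggests it. A condensed sketch of that loop, simplified from the sample (the interval, names, and shutdown signal are illustrative):

    import asyncio
    from datetime import timedelta

    from temporalio import workflow


    @workflow.defn
    class EntityLoop:
        def __init__(self) -> None:
            self.lock = asyncio.Lock()  # held by message handlers while they work
            self.shutdown_requested = False

        @workflow.signal
        async def request_shutdown(self) -> None:
            self.shutdown_requested = True

        def should_continue_as_new(self) -> bool:
            # Never continue-as-new while a handler holds the lock, and only once
            # the server suggests the history is getting long.
            return (
                not self.lock.locked()
                and workflow.info().is_continue_as_new_suggested()
            )

        @workflow.run
        async def run(self) -> None:
            while True:
                try:
                    await workflow.wait_condition(
                        lambda: self.shutdown_requested or self.should_continue_as_new(),
                        timeout=timedelta(minutes=10),
                    )
                except asyncio.TimeoutError:
                    pass  # periodic wake-up, e.g. to run health checks
                if self.shutdown_requested:
                    return
                if self.should_continue_as_new():
                    workflow.logger.info("Continuing as new")
                    workflow.continue_as_new()
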
diff --git a/tests/updates_and_signals/atomic_message_handlers_test.py b/tests/updates_and_signals/safe_message_handlers_test.py similarity index 90% rename from tests/updates_and_signals/atomic_message_handlers_test.py rename to tests/updates_and_signals/safe_message_handlers_test.py index 550b2480..2e2a6ae8 100644 --- a/tests/updates_and_signals/atomic_message_handlers_test.py +++ b/tests/updates_and_signals/safe_message_handlers_test.py @@ -5,20 +5,20 @@ from temporalio.exceptions import ApplicationError from temporalio.worker import Worker -from updates_and_signals.atomic_message_handlers.activities import ( +from updates_and_signals.safe_message_handlers.activities import ( allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes, ) -from updates_and_signals.atomic_message_handlers.starter import do_cluster_lifecycle -from updates_and_signals.atomic_message_handlers.workflow import ( +from updates_and_signals.safe_message_handlers.starter import do_cluster_lifecycle +from updates_and_signals.safe_message_handlers.workflow import ( ClusterManagerAllocateNNodesToJobInput, ClusterManagerInput, ClusterManagerWorkflow, ) -async def test_atomic_message_handlers(client: Client): +async def test_safe_message_handlers(client: Client): task_queue = f"tq-{uuid.uuid4()}" async with Worker( client, diff --git a/updates_and_signals/atomic_message_handlers/README.md b/updates_and_signals/safe_message_handlers/README.md similarity index 93% rename from updates_and_signals/atomic_message_handlers/README.md rename to updates_and_signals/safe_message_handlers/README.md index c808d4c9..d1c7d48f 100644 --- a/updates_and_signals/atomic_message_handlers/README.md +++ b/updates_and_signals/safe_message_handlers/README.md @@ -1,6 +1,6 @@ # Atomic message handlers -This sample shows off important techniques for handling signals and updates, aka messages. In particular, it illustrates how message handlers can interleave and how you can manage that. +This sample shows off important techniques for handling signals and updates, aka messages. In particular, it illustrates how message handlers can interleave or not be completed before the workflow completes, and how you can manage that. * Here, using workflow.wait_condition, signal and update handlers will only operate when the workflow is within a certain state--between cluster_started and cluster_shutdown. * You can run start_workflow with an initializer signal that you want to run before anything else other than the workflow's constructor. This pattern is known as "signal-with-start." 
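
The renamed README above highlights two patterns: gating handlers on workflow state with workflow.wait_condition, and delivering an initializer signal via signal-with-start. A compact sketch of both together; the workflow, id, and task queue names are illustrative:

    import uuid

    from temporalio import workflow
    from temporalio.client import Client


    @workflow.defn
    class GatedWorkflow:
        def __init__(self) -> None:
            self.started = False
            self.finished = False

        @workflow.signal
        async def start(self) -> None:
            self.started = True

        @workflow.signal
        async def finish(self) -> None:
            self.finished = True

        @workflow.update
        async def do_work(self) -> str:
            # The handler only proceeds once the workflow is in the right state.
            await workflow.wait_condition(lambda: self.started)
            return "ok"

        @workflow.run
        async def run(self) -> None:
            await workflow.wait_condition(lambda: self.finished)


    async def start_it(client: Client) -> None:
        # Signal-with-start: "start" is delivered before any other message is
        # handled, right after the workflow's constructor runs.
        await client.start_workflow(
            GatedWorkflow.run,
            id=f"gated-{uuid.uuid4()}",
            task_queue="example-task-queue",
            start_signal="start",
        )
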
diff --git a/updates_and_signals/atomic_message_handlers/__init__.py b/updates_and_signals/safe_message_handlers/__init__.py similarity index 100% rename from updates_and_signals/atomic_message_handlers/__init__.py rename to updates_and_signals/safe_message_handlers/__init__.py diff --git a/updates_and_signals/atomic_message_handlers/activities.py b/updates_and_signals/safe_message_handlers/activities.py similarity index 100% rename from updates_and_signals/atomic_message_handlers/activities.py rename to updates_and_signals/safe_message_handlers/activities.py diff --git a/updates_and_signals/atomic_message_handlers/starter.py b/updates_and_signals/safe_message_handlers/starter.py similarity index 97% rename from updates_and_signals/atomic_message_handlers/starter.py rename to updates_and_signals/safe_message_handlers/starter.py index 858dff25..13d7bab2 100644 --- a/updates_and_signals/atomic_message_handlers/starter.py +++ b/updates_and_signals/safe_message_handlers/starter.py @@ -7,7 +7,7 @@ from temporalio import client, common from temporalio.client import Client, WorkflowHandle -from updates_and_signals.atomic_message_handlers.workflow import ( +from updates_and_signals.safe_message_handlers.workflow import ( ClusterManagerAllocateNNodesToJobInput, ClusterManagerDeleteJobInput, ClusterManagerInput, diff --git a/updates_and_signals/atomic_message_handlers/worker.py b/updates_and_signals/safe_message_handlers/worker.py similarity index 94% rename from updates_and_signals/atomic_message_handlers/worker.py rename to updates_and_signals/safe_message_handlers/worker.py index fbe00850..183cd7ff 100644 --- a/updates_and_signals/atomic_message_handlers/worker.py +++ b/updates_and_signals/safe_message_handlers/worker.py @@ -5,7 +5,7 @@ from temporalio.client import Client, WorkflowHandle from temporalio.worker import Worker -from updates_and_signals.atomic_message_handlers.workflow import ( +from updates_and_signals.safe_message_handlers.workflow import ( ClusterManagerWorkflow, allocate_nodes_to_job, deallocate_nodes_for_job, diff --git a/updates_and_signals/atomic_message_handlers/workflow.py b/updates_and_signals/safe_message_handlers/workflow.py similarity index 99% rename from updates_and_signals/atomic_message_handlers/workflow.py rename to updates_and_signals/safe_message_handlers/workflow.py index 9e3ac5cb..03c4a2ee 100644 --- a/updates_and_signals/atomic_message_handlers/workflow.py +++ b/updates_and_signals/safe_message_handlers/workflow.py @@ -12,7 +12,7 @@ from temporalio.exceptions import ApplicationError from temporalio.worker import Worker -from updates_and_signals.atomic_message_handlers.activities import ( +from updates_and_signals.safe_message_handlers.activities import ( AllocateNodesToJobInput, DeallocateNodesForJobInput, FindBadNodesInput, From c8e90759909252849492a0df2cfe2dd598de4bb4 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Tue, 25 Jun 2024 17:08:16 -0700 Subject: [PATCH 18/28] Fix and demo idempotency --- .../safe_message_handlers_test.py | 33 ++++++++++++++++++- .../safe_message_handlers/README.md | 1 + .../safe_message_handlers/workflow.py | 27 +++++++++++---- 3 files changed, 54 insertions(+), 7 deletions(-) diff --git a/tests/updates_and_signals/safe_message_handlers_test.py b/tests/updates_and_signals/safe_message_handlers_test.py index 2e2a6ae8..adcc7242 100644 --- a/tests/updates_and_signals/safe_message_handlers_test.py +++ b/tests/updates_and_signals/safe_message_handlers_test.py @@ -38,6 +38,37 @@ async def test_safe_message_handlers(client: Client): 
assert result.num_currently_assigned_nodes == 0 +async def test_update_idempotency(client: Client): + task_queue = f"tq-{uuid.uuid4()}" + async with Worker( + client, + task_queue=task_queue, + workflows=[ClusterManagerWorkflow], + activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], + ): + cluster_manager_handle = await client.start_workflow( + ClusterManagerWorkflow.run, + ClusterManagerInput(), + id=f"ClusterManagerWorkflow-{uuid.uuid4()}", + task_queue=task_queue, + ) + + await cluster_manager_handle.signal(ClusterManagerWorkflow.start_cluster) + + nodes_1 = await cluster_manager_handle.execute_update( + ClusterManagerWorkflow.allocate_n_nodes_to_job, + ClusterManagerAllocateNNodesToJobInput(num_nodes=5, job_name=f"jobby-job"), + ) + # simulate that in calling it twice, the operation is idempotent + nodes_2 = await cluster_manager_handle.execute_update( + ClusterManagerWorkflow.allocate_n_nodes_to_job, + ClusterManagerAllocateNNodesToJobInput(num_nodes=5, job_name=f"jobby-job"), + ) + # the second call should not allocate more nodes (it may return fewer if the health check finds bad nodes + # in between the two signals.) + assert nodes_1 >= nodes_2 + + async def test_update_failure(client: Client): task_queue = f"tq-{uuid.uuid4()}" async with Worker( @@ -73,4 +104,4 @@ async def test_update_failure(client: Client): finally: await cluster_manager_handle.signal(ClusterManagerWorkflow.shutdown_cluster) result = await cluster_manager_handle.result() - assert result.num_currently_assigned_nodes == 24 + assert result.num_currently_assigned_nodes + result.num_bad_nodes == 24 diff --git a/updates_and_signals/safe_message_handlers/README.md b/updates_and_signals/safe_message_handlers/README.md index d1c7d48f..6a92206b 100644 --- a/updates_and_signals/safe_message_handlers/README.md +++ b/updates_and_signals/safe_message_handlers/README.md @@ -7,6 +7,7 @@ This sample shows off important techniques for handling signals and updates, aka * Message handlers can block and their actions can be interleaved with one another and with the main workflow. This can easily cause bugs, so we use a lock to protect shared state from interleaved access. * Message handlers should also finish before the workflow run completes. One option is to use a lock. * An "Entity" workflow, i.e. a long-lived workflow, periodically "continues as new". It must do this to prevent its history from growing too large, and it passes its state to the next workflow. You can check `workflow.info().is_continue_as_new_suggested()` to see when it's time. Just make sure message handlers have finished before doing so. +* Message handlers can be made idempotent. See update `ClusterManager.allocate_n_nodes_to_job`. To run, first see [README.md](../../README.md) for prerequisites. 
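The idempotency bullet added to the README above, reduced to a minimal sketch. Everything here (`IdempotentAssigner`, `AssignInput`, `jobs_seen`) is a hypothetical stand-in; the real guard lands in `allocate_n_nodes_to_job` in the workflow.py hunk that follows.

```python
import asyncio
from dataclasses import dataclass
from typing import Dict, List, Optional, Set

from temporalio import workflow


@dataclass
class AssignInput:
    job_name: str
    num_nodes: int


@workflow.defn
class IdempotentAssigner:
    def __init__(self) -> None:
        self.nodes: Dict[str, Optional[str]] = {str(i): None for i in range(25)}
        self.jobs_seen: Set[str] = set()
        self.lock = asyncio.Lock()
        self.done = False

    @workflow.signal
    async def shutdown(self) -> None:
        self.done = True

    @workflow.update
    async def assign(self, input: AssignInput) -> List[str]:
        async with self.lock:
            # Idempotency guard: a duplicate or retried update for the same job
            # returns the existing assignment instead of allocating twice.
            if input.job_name not in self.jobs_seen:
                free = [n for n, job in self.nodes.items() if job is None]
                for node in free[: input.num_nodes]:
                    self.nodes[node] = input.job_name
                self.jobs_seen.add(input.job_name)
            return [n for n, job in self.nodes.items() if job == input.job_name]

    @workflow.run
    async def run(self) -> None:
        await workflow.wait_condition(lambda: self.done)
```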
diff --git a/updates_and_signals/safe_message_handlers/workflow.py b/updates_and_signals/safe_message_handlers/workflow.py index 03c4a2ee..59c0daab 100644 --- a/updates_and_signals/safe_message_handlers/workflow.py +++ b/updates_and_signals/safe_message_handlers/workflow.py @@ -4,7 +4,7 @@ import uuid from dataclasses import dataclass from datetime import timedelta -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Set from temporalio import activity, common, workflow from temporalio.client import Client, WorkflowHandle @@ -29,6 +29,7 @@ class ClusterManagerState: cluster_started: bool = False cluster_shutdown: bool = False nodes: Dict[str, Optional[str]] = dataclasses.field(default_factory=dict) + jobs_added: Set[str] = dataclasses.field(default_factory=set) max_assigned_nodes: int = 0 @@ -42,6 +43,7 @@ class ClusterManagerInput: class ClusterManagerResult: max_assigned_nodes: int num_currently_assigned_nodes: int + num_bad_nodes: int @dataclass(kw_only=True) @@ -97,6 +99,9 @@ async def allocate_n_nodes_to_job( ) async with self.nodes_lock: + # Idempotency guard. + if input.job_name in self.state.jobs_added: + return self.get_assigned_nodes(job_name=input.job_name) unassigned_nodes = self.get_unassigned_nodes() if len(unassigned_nodes) < input.num_nodes: # If you want the client to receive a failure, either add an update validator and throw the @@ -113,7 +118,7 @@ async def allocate_n_nodes_to_job( self.state.max_assigned_nodes, len(self.get_assigned_nodes()), ) - return nodes_to_assign + return self.get_assigned_nodes(job_name=input.job_name) async def _allocate_nodes_to_job( self, assigned_nodes: List[str], job_name: str @@ -125,6 +130,7 @@ async def _allocate_nodes_to_job( ) for node in assigned_nodes: self.state.nodes[node] = job_name + self.state.jobs_added.add(job_name) # Even though it returns nothing, this is an update because the client may want track it, for example # to wait for nodes to be deallocated before reassigning them. @@ -157,8 +163,16 @@ async def _deallocate_nodes_for_job(self, nodes_to_free: List[str], job_name: st def get_unassigned_nodes(self) -> List[str]: return [k for k, v in self.state.nodes.items() if v is None] - def get_assigned_nodes(self) -> List[str]: - return [k for k, v in self.state.nodes.items() if v is not None and v != "BAD!"] + def get_bad_nodes(self) -> List[str]: + return [k for k, v in self.state.nodes.items() if v == "BAD!"] + + def get_assigned_nodes(self, *, job_name: Optional[str] = None) -> List[str]: + if job_name: + return [k for k, v in self.state.nodes.items() if v == job_name] + else: + return [ + k for k, v in self.state.nodes.items() if v is not None and v != "BAD!" 
+ ] async def perform_health_checks(self) -> None: async with self.nodes_lock: @@ -224,7 +238,8 @@ async def run(self, input: ClusterManagerInput) -> ClusterManagerResult: test_continue_as_new=input.test_continue_as_new, ) ) - return ClusterManagerResult( - self.state.max_assigned_nodes, len(self.get_assigned_nodes()) + self.state.max_assigned_nodes, + len(self.get_assigned_nodes()), + len(self.get_bad_nodes()), ) From 4fc6dace259b5ac562e433ff1700cad4809f8e68 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Tue, 25 Jun 2024 17:13:18 -0700 Subject: [PATCH 19/28] Compatibility with 3.8 --- .../safe_message_handlers/activities.py | 6 +++--- updates_and_signals/safe_message_handlers/workflow.py | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/updates_and_signals/safe_message_handlers/activities.py b/updates_and_signals/safe_message_handlers/activities.py index be83b8e3..bec506d5 100644 --- a/updates_and_signals/safe_message_handlers/activities.py +++ b/updates_and_signals/safe_message_handlers/activities.py @@ -5,7 +5,7 @@ from temporalio import activity -@dataclass(kw_only=True) +@dataclass class AllocateNodesToJobInput: nodes: List[str] job_name: str @@ -17,7 +17,7 @@ async def allocate_nodes_to_job(input: AllocateNodesToJobInput): await asyncio.sleep(0.1) -@dataclass(kw_only=True) +@dataclass class DeallocateNodesForJobInput: nodes: List[str] job_name: str @@ -29,7 +29,7 @@ async def deallocate_nodes_for_job(input: DeallocateNodesForJobInput): await asyncio.sleep(0.1) -@dataclass(kw_only=True) +@dataclass class FindBadNodesInput: nodes_to_check: List[str] diff --git a/updates_and_signals/safe_message_handlers/workflow.py b/updates_and_signals/safe_message_handlers/workflow.py index 59c0daab..e1cf5368 100644 --- a/updates_and_signals/safe_message_handlers/workflow.py +++ b/updates_and_signals/safe_message_handlers/workflow.py @@ -24,7 +24,7 @@ # In workflows that continue-as-new, it's convenient to store all your state in one serializable structure # to make it easier to pass between runs -@dataclass(kw_only=True) +@dataclass class ClusterManagerState: cluster_started: bool = False cluster_shutdown: bool = False @@ -33,7 +33,7 @@ class ClusterManagerState: max_assigned_nodes: int = 0 -@dataclass(kw_only=True) +@dataclass class ClusterManagerInput: state: Optional[ClusterManagerState] = None test_continue_as_new: bool = False @@ -46,13 +46,13 @@ class ClusterManagerResult: num_bad_nodes: int -@dataclass(kw_only=True) +@dataclass class ClusterManagerAllocateNNodesToJobInput: num_nodes: int job_name: str -@dataclass(kw_only=True) +@dataclass class ClusterManagerDeleteJobInput: job_name: str @@ -132,7 +132,7 @@ async def _allocate_nodes_to_job( self.state.nodes[node] = job_name self.state.jobs_added.add(job_name) - # Even though it returns nothing, this is an update because the client may want track it, for example + # Even though it returns nothing, this is an update because the client may want to track it, for example # to wait for nodes to be deallocated before reassigning them. 
@workflow.update async def delete_job(self, input: ClusterManagerDeleteJobInput) -> None: From 3ba8882d606e1376e176757f539a04cd9f17bc66 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Wed, 26 Jun 2024 21:37:26 -0700 Subject: [PATCH 20/28] More feedback --- README.md | 2 +- polling/frequent/README.md | 9 +- polling/infrequent/README.md | 10 +- polling/periodic_sequence/README.md | 10 +- .../safe_message_handlers_test.py | 107 ------------------ .../safe_message_handlers/README.md | 11 +- .../safe_message_handlers/activities.py | 4 +- .../safe_message_handlers/starter.py | 2 +- .../safe_message_handlers/worker.py | 2 +- .../safe_message_handlers/workflow.py | 1 - 10 files changed, 28 insertions(+), 130 deletions(-) delete mode 100644 tests/updates_and_signals/safe_message_handlers_test.py diff --git a/README.md b/README.md index 3fdcc157..e3fe046e 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,6 @@ Some examples require extra dependencies. See each sample's directory for specif * [hello_signal](hello/hello_signal.py) - Send signals to a workflow. * [activity_worker](activity_worker) - Use Python activities from a workflow in another language. -* [safe_message_handlers](updates_and_signals/safe_message_handlers/) - Safely handling updates and signals. * [bedrock](bedrock) - Orchestrate a chatbot with Amazon Bedrock. * [cloud_export_to_parquet](cloud_export_to_parquet) - Set up schedule workflow to process exported files on an hourly basis * [context_propagation](context_propagation) - Context propagation through workflows/activities via interceptor. @@ -67,6 +66,7 @@ Some examples require extra dependencies. See each sample's directory for specif * [polling](polling) - Recommended implementation of an activity that needs to periodically poll an external resource waiting its successful completion. * [prometheus](prometheus) - Configure Prometheus metrics on clients/workers. * [pydantic_converter](pydantic_converter) - Data converter for using Pydantic models. +* [safe_message_handlers](updates_and_signals/safe_message_handlers/) - Safely handling updates and signals. * [schedules](schedules) - Demonstrates a Workflow Execution that occurs according to a schedule. * [sentry](sentry) - Report errors to Sentry. * [worker_specific_task_queues](worker_specific_task_queues) - Use unique task queues to ensure activities run on specific workers. diff --git a/polling/frequent/README.md b/polling/frequent/README.md index 33b6bc0a..a54ea267 100644 --- a/polling/frequent/README.md +++ b/polling/frequent/README.md @@ -8,10 +8,11 @@ To run, first see [README.md](../../README.md) for prerequisites. Then, run the following from this directory to run the sample: -```bash -poetry run python run_worker.py -poetry run python run_frequent.py -``` + poetry run python run_worker.py + +Then, in another terminal, run the following to execute the workflow: + + poetry run python run_frequent.py The Workflow will continue to poll the service and heartbeat on every iteration until it succeeds. diff --git a/polling/infrequent/README.md b/polling/infrequent/README.md index 7c0a3225..c1b06d7f 100644 --- a/polling/infrequent/README.md +++ b/polling/infrequent/README.md @@ -13,10 +13,12 @@ To run, first see [README.md](../../README.md) for prerequisites. 
Then, run the following from this directory to run the sample: -```bash -poetry run python run_worker.py -poetry run python run_infrequent.py -``` + poetry run python run_worker.py + +Then, in another terminal, run the following to execute the workflow: + + poetry run python run_infrequent.py + Since the test service simulates being _down_ for four polling attempts and then returns _OK_ on the fifth poll attempt, the Workflow will perform four Activity retries with a 60-second poll interval, and then return the service result on the successful fifth attempt. diff --git a/polling/periodic_sequence/README.md b/polling/periodic_sequence/README.md index 65fe0669..f8416588 100644 --- a/polling/periodic_sequence/README.md +++ b/polling/periodic_sequence/README.md @@ -8,10 +8,12 @@ To run, first see [README.md](../../README.md) for prerequisites. Then, run the following from this directory to run the sample: -```bash -poetry run python run_worker.py -poetry run python run_periodic.py -``` + poetry run python run_worker.py + +Then, in another terminal, run the following to execute the workflow: + + poetry run python run_periodic.py + This will start a Workflow and Child Workflow to periodically poll an Activity. The Parent Workflow is not aware about the Child Workflow calling Continue-As-New, and it gets notified when it completes (or fails). \ No newline at end of file diff --git a/tests/updates_and_signals/safe_message_handlers_test.py b/tests/updates_and_signals/safe_message_handlers_test.py deleted file mode 100644 index adcc7242..00000000 --- a/tests/updates_and_signals/safe_message_handlers_test.py +++ /dev/null @@ -1,107 +0,0 @@ -import uuid - -from temporalio import common, workflow -from temporalio.client import Client, WorkflowUpdateFailedError -from temporalio.exceptions import ApplicationError -from temporalio.worker import Worker - -from updates_and_signals.safe_message_handlers.activities import ( - allocate_nodes_to_job, - deallocate_nodes_for_job, - find_bad_nodes, -) -from updates_and_signals.safe_message_handlers.starter import do_cluster_lifecycle -from updates_and_signals.safe_message_handlers.workflow import ( - ClusterManagerAllocateNNodesToJobInput, - ClusterManagerInput, - ClusterManagerWorkflow, -) - - -async def test_safe_message_handlers(client: Client): - task_queue = f"tq-{uuid.uuid4()}" - async with Worker( - client, - task_queue=task_queue, - workflows=[ClusterManagerWorkflow], - activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], - ): - cluster_manager_handle = await client.start_workflow( - ClusterManagerWorkflow.run, - ClusterManagerInput(), - id=f"ClusterManagerWorkflow-{uuid.uuid4()}", - task_queue=task_queue, - ) - await do_cluster_lifecycle(cluster_manager_handle, delay_seconds=1) - result = await cluster_manager_handle.result() - assert result.max_assigned_nodes == 12 - assert result.num_currently_assigned_nodes == 0 - - -async def test_update_idempotency(client: Client): - task_queue = f"tq-{uuid.uuid4()}" - async with Worker( - client, - task_queue=task_queue, - workflows=[ClusterManagerWorkflow], - activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], - ): - cluster_manager_handle = await client.start_workflow( - ClusterManagerWorkflow.run, - ClusterManagerInput(), - id=f"ClusterManagerWorkflow-{uuid.uuid4()}", - task_queue=task_queue, - ) - - await cluster_manager_handle.signal(ClusterManagerWorkflow.start_cluster) - - nodes_1 = await cluster_manager_handle.execute_update( - 
ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput(num_nodes=5, job_name=f"jobby-job"), - ) - # simulate that in calling it twice, the operation is idempotent - nodes_2 = await cluster_manager_handle.execute_update( - ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput(num_nodes=5, job_name=f"jobby-job"), - ) - # the second call should not allocate more nodes (it may return fewer if the health check finds bad nodes - # in between the two signals.) - assert nodes_1 >= nodes_2 - - -async def test_update_failure(client: Client): - task_queue = f"tq-{uuid.uuid4()}" - async with Worker( - client, - task_queue=task_queue, - workflows=[ClusterManagerWorkflow], - activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], - ): - cluster_manager_handle = await client.start_workflow( - ClusterManagerWorkflow.run, - ClusterManagerInput(), - id=f"ClusterManagerWorkflow-{uuid.uuid4()}", - task_queue=task_queue, - ) - - await cluster_manager_handle.signal(ClusterManagerWorkflow.start_cluster) - - await cluster_manager_handle.execute_update( - ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput(num_nodes=24, job_name=f"big-task"), - ) - try: - # Try to allocate too many nodes - await cluster_manager_handle.execute_update( - ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput( - num_nodes=3, job_name=f"little-task" - ), - ) - except WorkflowUpdateFailedError as e: - assert isinstance(e.cause, ApplicationError) - assert e.cause.message == "Cannot allocate 3 nodes; have only 1 available" - finally: - await cluster_manager_handle.signal(ClusterManagerWorkflow.shutdown_cluster) - result = await cluster_manager_handle.result() - assert result.num_currently_assigned_nodes + result.num_bad_nodes == 24 diff --git a/updates_and_signals/safe_message_handlers/README.md b/updates_and_signals/safe_message_handlers/README.md index 6a92206b..7dd71d1d 100644 --- a/updates_and_signals/safe_message_handlers/README.md +++ b/updates_and_signals/safe_message_handlers/README.md @@ -11,11 +11,12 @@ This sample shows off important techniques for handling signals and updates, aka To run, first see [README.md](../../README.md) for prerequisites. -Then, run the following from this directory to run the sample: +Then, run the following from this directory to run the worker: +\ + poetry run python worker.py -```bash -poetry run python worker.py -poetry run python starter.py -``` +Then, in another terminal, run the following to execute the workflow: + + poetry run python starter.py This will start a worker to run your workflow and activities, then start a ClusterManagerWorkflow and put it through its paces. 
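A sketch of what "put it through its paces" looks like from the client, assuming the sample as of this patch (the `allocate_n_nodes_to_job` update, the `start_cluster`/`shutdown_cluster` signals, and the worker above polling `safe-message-handlers-task-queue`) plus a local Temporal server; the workflow ID is illustrative.

```python
import uuid

from temporalio.client import Client

from updates_and_signals.safe_message_handlers.workflow import (
    ClusterManagerAllocateNNodesToJobInput,
    ClusterManagerInput,
    ClusterManagerWorkflow,
)


async def drive_cluster_manager() -> None:
    client = await Client.connect("localhost:7233")
    handle = await client.start_workflow(
        ClusterManagerWorkflow.run,
        ClusterManagerInput(),
        id=f"ClusterManagerWorkflow-{uuid.uuid4()}",
        task_queue="safe-message-handlers-task-queue",
    )
    await handle.signal(ClusterManagerWorkflow.start_cluster)
    assigned = await handle.execute_update(
        ClusterManagerWorkflow.allocate_n_nodes_to_job,
        ClusterManagerAllocateNNodesToJobInput(num_nodes=2, job_name="task-0"),
    )
    print(f"Nodes assigned to task-0: {assigned}")
    await handle.signal(ClusterManagerWorkflow.shutdown_cluster)
    print(await handle.result())
```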
diff --git a/updates_and_signals/safe_message_handlers/activities.py b/updates_and_signals/safe_message_handlers/activities.py index bec506d5..e7e07267 100644 --- a/updates_and_signals/safe_message_handlers/activities.py +++ b/updates_and_signals/safe_message_handlers/activities.py @@ -12,7 +12,7 @@ class AllocateNodesToJobInput: @activity.defn -async def allocate_nodes_to_job(input: AllocateNodesToJobInput): +async def allocate_nodes_to_job(input: AllocateNodesToJobInput) -> None: print(f"Assigning nodes {input.nodes} to job {input.job_name}") await asyncio.sleep(0.1) @@ -24,7 +24,7 @@ class DeallocateNodesForJobInput: @activity.defn -async def deallocate_nodes_for_job(input: DeallocateNodesForJobInput): +async def deallocate_nodes_for_job(input: DeallocateNodesForJobInput) -> None: print(f"Deallocating nodes {input.nodes} from job {input.job_name}") await asyncio.sleep(0.1) diff --git a/updates_and_signals/safe_message_handlers/starter.py b/updates_and_signals/safe_message_handlers/starter.py index 13d7bab2..eed63e73 100644 --- a/updates_and_signals/safe_message_handlers/starter.py +++ b/updates_and_signals/safe_message_handlers/starter.py @@ -55,7 +55,7 @@ async def main(should_test_continue_as_new: bool): ClusterManagerWorkflow.run, ClusterManagerInput(test_continue_as_new=should_test_continue_as_new), id=f"ClusterManagerWorkflow-{uuid.uuid4()}", - task_queue="atomic-message-handlers-task-queue", + task_queue="safe-message-handlers-task-queue", id_reuse_policy=common.WorkflowIDReusePolicy.TERMINATE_IF_RUNNING, ) delay_seconds = 10 if should_test_continue_as_new else 1 diff --git a/updates_and_signals/safe_message_handlers/worker.py b/updates_and_signals/safe_message_handlers/worker.py index 183cd7ff..440a8756 100644 --- a/updates_and_signals/safe_message_handlers/worker.py +++ b/updates_and_signals/safe_message_handlers/worker.py @@ -21,7 +21,7 @@ async def main(): async with Worker( client, - task_queue="atomic-message-handlers-task-queue", + task_queue="safe-message-handlers-task-queue", workflows=[ClusterManagerWorkflow], activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], ): diff --git a/updates_and_signals/safe_message_handlers/workflow.py b/updates_and_signals/safe_message_handlers/workflow.py index e1cf5368..513ef39b 100644 --- a/updates_and_signals/safe_message_handlers/workflow.py +++ b/updates_and_signals/safe_message_handlers/workflow.py @@ -216,7 +216,6 @@ def should_continue_as_new(self) -> bool: async def run(self, input: ClusterManagerInput) -> ClusterManagerResult: self.init(input) await workflow.wait_condition(lambda: self.state.cluster_started) - # Perform health checks at intervals. 
while True: await self.perform_health_checks() From f47369e986ff6d164242a35811bd474e4deecad1 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Thu, 27 Jun 2024 09:47:52 -0700 Subject: [PATCH 21/28] Re-add tests --- .../safe_message_handlers/workflow_test.py | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 tests/updates_and_signals/safe_message_handlers/workflow_test.py diff --git a/tests/updates_and_signals/safe_message_handlers/workflow_test.py b/tests/updates_and_signals/safe_message_handlers/workflow_test.py new file mode 100644 index 00000000..adcc7242 --- /dev/null +++ b/tests/updates_and_signals/safe_message_handlers/workflow_test.py @@ -0,0 +1,107 @@ +import uuid + +from temporalio import common, workflow +from temporalio.client import Client, WorkflowUpdateFailedError +from temporalio.exceptions import ApplicationError +from temporalio.worker import Worker + +from updates_and_signals.safe_message_handlers.activities import ( + allocate_nodes_to_job, + deallocate_nodes_for_job, + find_bad_nodes, +) +from updates_and_signals.safe_message_handlers.starter import do_cluster_lifecycle +from updates_and_signals.safe_message_handlers.workflow import ( + ClusterManagerAllocateNNodesToJobInput, + ClusterManagerInput, + ClusterManagerWorkflow, +) + + +async def test_safe_message_handlers(client: Client): + task_queue = f"tq-{uuid.uuid4()}" + async with Worker( + client, + task_queue=task_queue, + workflows=[ClusterManagerWorkflow], + activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], + ): + cluster_manager_handle = await client.start_workflow( + ClusterManagerWorkflow.run, + ClusterManagerInput(), + id=f"ClusterManagerWorkflow-{uuid.uuid4()}", + task_queue=task_queue, + ) + await do_cluster_lifecycle(cluster_manager_handle, delay_seconds=1) + result = await cluster_manager_handle.result() + assert result.max_assigned_nodes == 12 + assert result.num_currently_assigned_nodes == 0 + + +async def test_update_idempotency(client: Client): + task_queue = f"tq-{uuid.uuid4()}" + async with Worker( + client, + task_queue=task_queue, + workflows=[ClusterManagerWorkflow], + activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], + ): + cluster_manager_handle = await client.start_workflow( + ClusterManagerWorkflow.run, + ClusterManagerInput(), + id=f"ClusterManagerWorkflow-{uuid.uuid4()}", + task_queue=task_queue, + ) + + await cluster_manager_handle.signal(ClusterManagerWorkflow.start_cluster) + + nodes_1 = await cluster_manager_handle.execute_update( + ClusterManagerWorkflow.allocate_n_nodes_to_job, + ClusterManagerAllocateNNodesToJobInput(num_nodes=5, job_name=f"jobby-job"), + ) + # simulate that in calling it twice, the operation is idempotent + nodes_2 = await cluster_manager_handle.execute_update( + ClusterManagerWorkflow.allocate_n_nodes_to_job, + ClusterManagerAllocateNNodesToJobInput(num_nodes=5, job_name=f"jobby-job"), + ) + # the second call should not allocate more nodes (it may return fewer if the health check finds bad nodes + # in between the two signals.) 
+ assert nodes_1 >= nodes_2 + + +async def test_update_failure(client: Client): + task_queue = f"tq-{uuid.uuid4()}" + async with Worker( + client, + task_queue=task_queue, + workflows=[ClusterManagerWorkflow], + activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], + ): + cluster_manager_handle = await client.start_workflow( + ClusterManagerWorkflow.run, + ClusterManagerInput(), + id=f"ClusterManagerWorkflow-{uuid.uuid4()}", + task_queue=task_queue, + ) + + await cluster_manager_handle.signal(ClusterManagerWorkflow.start_cluster) + + await cluster_manager_handle.execute_update( + ClusterManagerWorkflow.allocate_n_nodes_to_job, + ClusterManagerAllocateNNodesToJobInput(num_nodes=24, job_name=f"big-task"), + ) + try: + # Try to allocate too many nodes + await cluster_manager_handle.execute_update( + ClusterManagerWorkflow.allocate_n_nodes_to_job, + ClusterManagerAllocateNNodesToJobInput( + num_nodes=3, job_name=f"little-task" + ), + ) + except WorkflowUpdateFailedError as e: + assert isinstance(e.cause, ApplicationError) + assert e.cause.message == "Cannot allocate 3 nodes; have only 1 available" + finally: + await cluster_manager_handle.signal(ClusterManagerWorkflow.shutdown_cluster) + result = await cluster_manager_handle.result() + assert result.num_currently_assigned_nodes + result.num_bad_nodes == 24 From 5dc618574b36235e5f5c9ad8630ad14cd4562ef2 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Thu, 27 Jun 2024 10:00:33 -0700 Subject: [PATCH 22/28] Fix flaky test --- .../updates_and_signals/safe_message_handlers/workflow_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/updates_and_signals/safe_message_handlers/workflow_test.py b/tests/updates_and_signals/safe_message_handlers/workflow_test.py index adcc7242..3c9ccf4f 100644 --- a/tests/updates_and_signals/safe_message_handlers/workflow_test.py +++ b/tests/updates_and_signals/safe_message_handlers/workflow_test.py @@ -34,7 +34,7 @@ async def test_safe_message_handlers(client: Client): ) await do_cluster_lifecycle(cluster_manager_handle, delay_seconds=1) result = await cluster_manager_handle.result() - assert result.max_assigned_nodes == 12 + assert result.max_assigned_nodes + result.num_bad_nodes == 12 assert result.num_currently_assigned_nodes == 0 From 5b45b218c0512c0c79ab018e808855e17bd89d28 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Mon, 8 Jul 2024 12:38:31 -0700 Subject: [PATCH 23/28] Improve update and tests * list -> set to fix a test * Return a struct rather than a raw value from the list for better hygiene * Remove test dependency on race conditions between health check and adding nodes. 
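The gist of the reworked test is sketched below (the full diff follows): fire the updates concurrently and gather the results, rather than depending on timing between the health check and node assignment. It assumes a `WorkflowHandle` obtained as in the other tests, with the cluster already started.

```python
import asyncio

from temporalio.client import WorkflowHandle

from updates_and_signals.safe_message_handlers.workflow import (
    ClusterManagerAllocateNNodesToJobInput,
    ClusterManagerWorkflow,
)


async def allocate_many(handle: WorkflowHandle) -> None:
    updates = [
        handle.execute_update(
            ClusterManagerWorkflow.allocate_n_nodes_to_job,
            ClusterManagerAllocateNNodesToJobInput(num_nodes=2, job_name=f"task-{i}"),
        )
        for i in range(6)
    ]
    # The workflow's lock serializes the handlers, so all six updates succeed
    # even though the client sends them concurrently.
    results = await asyncio.gather(*updates)
    for result in results:
        # Each update now returns a result struct instead of a raw list.
        assert len(result.nodes_assigned) == 2
```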
--- .../safe_message_handlers/workflow_test.py | 40 ++++++++++++++++--- .../safe_message_handlers/activities.py | 8 ++-- .../safe_message_handlers/workflow.py | 33 ++++++++------- 3 files changed, 55 insertions(+), 26 deletions(-) diff --git a/tests/updates_and_signals/safe_message_handlers/workflow_test.py b/tests/updates_and_signals/safe_message_handlers/workflow_test.py index 3c9ccf4f..3a4e2e5a 100644 --- a/tests/updates_and_signals/safe_message_handlers/workflow_test.py +++ b/tests/updates_and_signals/safe_message_handlers/workflow_test.py @@ -1,3 +1,4 @@ +import asyncio import uuid from temporalio import common, workflow @@ -13,6 +14,7 @@ from updates_and_signals.safe_message_handlers.starter import do_cluster_lifecycle from updates_and_signals.safe_message_handlers.workflow import ( ClusterManagerAllocateNNodesToJobInput, + ClusterManagerDeleteJobInput, ClusterManagerInput, ClusterManagerWorkflow, ) @@ -32,9 +34,37 @@ async def test_safe_message_handlers(client: Client): id=f"ClusterManagerWorkflow-{uuid.uuid4()}", task_queue=task_queue, ) - await do_cluster_lifecycle(cluster_manager_handle, delay_seconds=1) + await cluster_manager_handle.signal(ClusterManagerWorkflow.start_cluster) + + allocation_updates = [] + for i in range(6): + allocation_updates.append( + cluster_manager_handle.execute_update( + ClusterManagerWorkflow.allocate_n_nodes_to_job, + ClusterManagerAllocateNNodesToJobInput( + num_nodes=2, job_name=f"task-{i}" + ), + ) + ) + results = await asyncio.gather(*allocation_updates) + for result in results: + assert len(result.nodes_assigned) == 2 + + await asyncio.sleep(1) + + deletion_updates = [] + for i in range(6): + deletion_updates.append( + cluster_manager_handle.execute_update( + ClusterManagerWorkflow.delete_job, + ClusterManagerDeleteJobInput(job_name=f"task-{i}"), + ) + ) + await asyncio.gather(*deletion_updates) + + await cluster_manager_handle.signal(ClusterManagerWorkflow.shutdown_cluster) + result = await cluster_manager_handle.result() - assert result.max_assigned_nodes + result.num_bad_nodes == 12 assert result.num_currently_assigned_nodes == 0 @@ -55,18 +85,18 @@ async def test_update_idempotency(client: Client): await cluster_manager_handle.signal(ClusterManagerWorkflow.start_cluster) - nodes_1 = await cluster_manager_handle.execute_update( + result_1 = await cluster_manager_handle.execute_update( ClusterManagerWorkflow.allocate_n_nodes_to_job, ClusterManagerAllocateNNodesToJobInput(num_nodes=5, job_name=f"jobby-job"), ) # simulate that in calling it twice, the operation is idempotent - nodes_2 = await cluster_manager_handle.execute_update( + result_2 = await cluster_manager_handle.execute_update( ClusterManagerWorkflow.allocate_n_nodes_to_job, ClusterManagerAllocateNNodesToJobInput(num_nodes=5, job_name=f"jobby-job"), ) # the second call should not allocate more nodes (it may return fewer if the health check finds bad nodes # in between the two signals.) 
- assert nodes_1 >= nodes_2 + assert result_1.nodes_assigned >= result_2.nodes_assigned async def test_update_failure(client: Client): diff --git a/updates_and_signals/safe_message_handlers/activities.py b/updates_and_signals/safe_message_handlers/activities.py index e7e07267..5360dfef 100644 --- a/updates_and_signals/safe_message_handlers/activities.py +++ b/updates_and_signals/safe_message_handlers/activities.py @@ -1,6 +1,6 @@ import asyncio from dataclasses import dataclass -from typing import List +from typing import List, Set from temporalio import activity @@ -31,13 +31,13 @@ async def deallocate_nodes_for_job(input: DeallocateNodesForJobInput) -> None: @dataclass class FindBadNodesInput: - nodes_to_check: List[str] + nodes_to_check: Set[str] @activity.defn -async def find_bad_nodes(input: FindBadNodesInput) -> List[str]: +async def find_bad_nodes(input: FindBadNodesInput) -> Set[str]: await asyncio.sleep(0.1) - bad_nodes = [n for n in input.nodes_to_check if int(n) % 5 == 0] + bad_nodes = set([n for n in input.nodes_to_check if int(n) % 5 == 0]) if bad_nodes: print(f"Found bad nodes: {bad_nodes}") else: diff --git a/updates_and_signals/safe_message_handlers/workflow.py b/updates_and_signals/safe_message_handlers/workflow.py index 513ef39b..1ec2e5a2 100644 --- a/updates_and_signals/safe_message_handlers/workflow.py +++ b/updates_and_signals/safe_message_handlers/workflow.py @@ -30,7 +30,6 @@ class ClusterManagerState: cluster_shutdown: bool = False nodes: Dict[str, Optional[str]] = dataclasses.field(default_factory=dict) jobs_added: Set[str] = dataclasses.field(default_factory=set) - max_assigned_nodes: int = 0 @dataclass @@ -41,11 +40,11 @@ class ClusterManagerInput: @dataclass class ClusterManagerResult: - max_assigned_nodes: int num_currently_assigned_nodes: int num_bad_nodes: int - +# Be in the habit of storing message inputs and outputs in serializable structures. +# This makes it easier to add more over time in a backward-compatible way. @dataclass class ClusterManagerAllocateNNodesToJobInput: num_nodes: int @@ -56,6 +55,9 @@ class ClusterManagerAllocateNNodesToJobInput: class ClusterManagerDeleteJobInput: job_name: str +@dataclass +class ClusterManagerAllocateNNodesToJobResult: + nodes_assigned: Set[str] # ClusterManagerWorkflow keeps track of the allocations of a cluster of nodes. # Via signals, the cluster can be started and shutdown. @@ -88,7 +90,7 @@ async def shutdown_cluster(self) -> None: @workflow.update async def allocate_n_nodes_to_job( self, input: ClusterManagerAllocateNNodesToJobInput - ) -> List[str]: + ) -> ClusterManagerAllocateNNodesToJobResult: await workflow.wait_condition(lambda: self.state.cluster_started) if self.state.cluster_shutdown: # If you want the client to receive a failure, either add an update validator and throw the @@ -101,7 +103,8 @@ async def allocate_n_nodes_to_job( async with self.nodes_lock: # Idempotency guard. if input.job_name in self.state.jobs_added: - return self.get_assigned_nodes(job_name=input.job_name) + return ClusterManagerAllocateNNodesToJobResult( + self.get_assigned_nodes(job_name=input.job_name)) unassigned_nodes = self.get_unassigned_nodes() if len(unassigned_nodes) < input.num_nodes: # If you want the client to receive a failure, either add an update validator and throw the @@ -114,11 +117,8 @@ async def allocate_n_nodes_to_job( # This await would be dangerous without nodes_lock because it yields control and allows interleaving # with delete_job and perform_health_checks, which both touch self.state.nodes. 
await self._allocate_nodes_to_job(nodes_to_assign, input.job_name) - self.state.max_assigned_nodes = max( - self.state.max_assigned_nodes, - len(self.get_assigned_nodes()), - ) - return self.get_assigned_nodes(job_name=input.job_name) + return ClusterManagerAllocateNNodesToJobResult( + nodes_assigned=self.get_assigned_nodes(job_name=input.job_name)) async def _allocate_nodes_to_job( self, assigned_nodes: List[str], job_name: str @@ -163,16 +163,16 @@ async def _deallocate_nodes_for_job(self, nodes_to_free: List[str], job_name: st def get_unassigned_nodes(self) -> List[str]: return [k for k, v in self.state.nodes.items() if v is None] - def get_bad_nodes(self) -> List[str]: - return [k for k, v in self.state.nodes.items() if v == "BAD!"] + def get_bad_nodes(self) -> Set[str]: + return set([k for k, v in self.state.nodes.items() if v == "BAD!"]) - def get_assigned_nodes(self, *, job_name: Optional[str] = None) -> List[str]: + def get_assigned_nodes(self, *, job_name: Optional[str] = None) -> Set[str]: if job_name: - return [k for k, v in self.state.nodes.items() if v == job_name] + return set([k for k, v in self.state.nodes.items() if v == job_name]) else: - return [ + return set([ k for k, v in self.state.nodes.items() if v is not None and v != "BAD!" - ] + ]) async def perform_health_checks(self) -> None: async with self.nodes_lock: @@ -238,7 +238,6 @@ async def run(self, input: ClusterManagerInput) -> ClusterManagerResult: ) ) return ClusterManagerResult( - self.state.max_assigned_nodes, len(self.get_assigned_nodes()), len(self.get_bad_nodes()), ) From ce4d384288f848341a1f2fd5caf80ce95d729b3c Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Mon, 8 Jul 2024 12:41:52 -0700 Subject: [PATCH 24/28] Ruff linting --- .../safe_message_handlers/workflow_test.py | 10 ++++------ updates_and_signals/safe_message_handlers/workflow.py | 6 +----- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/tests/updates_and_signals/safe_message_handlers/workflow_test.py b/tests/updates_and_signals/safe_message_handlers/workflow_test.py index 3a4e2e5a..be8883ce 100644 --- a/tests/updates_and_signals/safe_message_handlers/workflow_test.py +++ b/tests/updates_and_signals/safe_message_handlers/workflow_test.py @@ -1,7 +1,6 @@ import asyncio import uuid -from temporalio import common, workflow from temporalio.client import Client, WorkflowUpdateFailedError from temporalio.exceptions import ApplicationError from temporalio.worker import Worker @@ -11,7 +10,6 @@ deallocate_nodes_for_job, find_bad_nodes, ) -from updates_and_signals.safe_message_handlers.starter import do_cluster_lifecycle from updates_and_signals.safe_message_handlers.workflow import ( ClusterManagerAllocateNNodesToJobInput, ClusterManagerDeleteJobInput, @@ -87,12 +85,12 @@ async def test_update_idempotency(client: Client): result_1 = await cluster_manager_handle.execute_update( ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput(num_nodes=5, job_name=f"jobby-job"), + ClusterManagerAllocateNNodesToJobInput(num_nodes=5, job_name="jobby-job"), ) # simulate that in calling it twice, the operation is idempotent result_2 = await cluster_manager_handle.execute_update( ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput(num_nodes=5, job_name=f"jobby-job"), + ClusterManagerAllocateNNodesToJobInput(num_nodes=5, job_name="jobby-job"), ) # the second call should not allocate more nodes (it may return fewer if the health check finds bad nodes # in between the two signals.) 
@@ -118,14 +116,14 @@ async def test_update_failure(client: Client): await cluster_manager_handle.execute_update( ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput(num_nodes=24, job_name=f"big-task"), + ClusterManagerAllocateNNodesToJobInput(num_nodes=24, job_name="big-task"), ) try: # Try to allocate too many nodes await cluster_manager_handle.execute_update( ClusterManagerWorkflow.allocate_n_nodes_to_job, ClusterManagerAllocateNNodesToJobInput( - num_nodes=3, job_name=f"little-task" + num_nodes=3, job_name="little-task" ), ) except WorkflowUpdateFailedError as e: diff --git a/updates_and_signals/safe_message_handlers/workflow.py b/updates_and_signals/safe_message_handlers/workflow.py index 1ec2e5a2..0b1b1b65 100644 --- a/updates_and_signals/safe_message_handlers/workflow.py +++ b/updates_and_signals/safe_message_handlers/workflow.py @@ -1,16 +1,12 @@ import asyncio import dataclasses -import logging -import uuid from dataclasses import dataclass from datetime import timedelta from typing import Dict, List, Optional, Set -from temporalio import activity, common, workflow -from temporalio.client import Client, WorkflowHandle +from temporalio import workflow from temporalio.common import RetryPolicy from temporalio.exceptions import ApplicationError -from temporalio.worker import Worker from updates_and_signals.safe_message_handlers.activities import ( AllocateNodesToJobInput, From 52429bd8d6020a9730d1519cff046e7a049ab4af Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Mon, 8 Jul 2024 15:28:38 -0700 Subject: [PATCH 25/28] Use consistent verbs, improve health check --- .../safe_message_handlers/workflow_test.py | 42 ++++----- .../safe_message_handlers/README.md | 2 +- .../safe_message_handlers/activities.py | 8 +- .../safe_message_handlers/starter.py | 10 +-- .../safe_message_handlers/worker.py | 9 +- .../safe_message_handlers/workflow.py | 90 ++++++++++--------- 6 files changed, 82 insertions(+), 79 deletions(-) diff --git a/tests/updates_and_signals/safe_message_handlers/workflow_test.py b/tests/updates_and_signals/safe_message_handlers/workflow_test.py index be8883ce..aed025d9 100644 --- a/tests/updates_and_signals/safe_message_handlers/workflow_test.py +++ b/tests/updates_and_signals/safe_message_handlers/workflow_test.py @@ -6,12 +6,12 @@ from temporalio.worker import Worker from updates_and_signals.safe_message_handlers.activities import ( - allocate_nodes_to_job, - deallocate_nodes_for_job, + assign_nodes_to_job, + unassign_nodes_for_job, find_bad_nodes, ) from updates_and_signals.safe_message_handlers.workflow import ( - ClusterManagerAllocateNNodesToJobInput, + ClusterManagerAssignNodesToJobInput, ClusterManagerDeleteJobInput, ClusterManagerInput, ClusterManagerWorkflow, @@ -24,7 +24,7 @@ async def test_safe_message_handlers(client: Client): client, task_queue=task_queue, workflows=[ClusterManagerWorkflow], - activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], + activities=[assign_nodes_to_job, unassign_nodes_for_job, find_bad_nodes], ): cluster_manager_handle = await client.start_workflow( ClusterManagerWorkflow.run, @@ -38,9 +38,9 @@ async def test_safe_message_handlers(client: Client): for i in range(6): allocation_updates.append( cluster_manager_handle.execute_update( - ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput( - num_nodes=2, job_name=f"task-{i}" + ClusterManagerWorkflow.assign_nodes_to_job, + ClusterManagerAssignNodesToJobInput( + total_num_nodes=2, 
job_name=f"task-{i}" ), ) ) @@ -72,7 +72,7 @@ async def test_update_idempotency(client: Client): client, task_queue=task_queue, workflows=[ClusterManagerWorkflow], - activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], + activities=[assign_nodes_to_job, unassign_nodes_for_job, find_bad_nodes], ): cluster_manager_handle = await client.start_workflow( ClusterManagerWorkflow.run, @@ -84,15 +84,15 @@ async def test_update_idempotency(client: Client): await cluster_manager_handle.signal(ClusterManagerWorkflow.start_cluster) result_1 = await cluster_manager_handle.execute_update( - ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput(num_nodes=5, job_name="jobby-job"), + ClusterManagerWorkflow.assign_nodes_to_job, + ClusterManagerAssignNodesToJobInput(total_num_nodes=5, job_name="jobby-job"), ) # simulate that in calling it twice, the operation is idempotent result_2 = await cluster_manager_handle.execute_update( - ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput(num_nodes=5, job_name="jobby-job"), + ClusterManagerWorkflow.assign_nodes_to_job, + ClusterManagerAssignNodesToJobInput(total_num_nodes=5, job_name="jobby-job"), ) - # the second call should not allocate more nodes (it may return fewer if the health check finds bad nodes + # the second call should not assign more nodes (it may return fewer if the health check finds bad nodes # in between the two signals.) assert result_1.nodes_assigned >= result_2.nodes_assigned @@ -103,7 +103,7 @@ async def test_update_failure(client: Client): client, task_queue=task_queue, workflows=[ClusterManagerWorkflow], - activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], + activities=[assign_nodes_to_job, unassign_nodes_for_job, find_bad_nodes], ): cluster_manager_handle = await client.start_workflow( ClusterManagerWorkflow.run, @@ -115,20 +115,20 @@ async def test_update_failure(client: Client): await cluster_manager_handle.signal(ClusterManagerWorkflow.start_cluster) await cluster_manager_handle.execute_update( - ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput(num_nodes=24, job_name="big-task"), + ClusterManagerWorkflow.assign_nodes_to_job, + ClusterManagerAssignNodesToJobInput(total_num_nodes=24, job_name="big-task"), ) try: - # Try to allocate too many nodes + # Try to assign too many nodes await cluster_manager_handle.execute_update( - ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput( - num_nodes=3, job_name="little-task" + ClusterManagerWorkflow.assign_nodes_to_job, + ClusterManagerAssignNodesToJobInput( + total_num_nodes=3, job_name="little-task" ), ) except WorkflowUpdateFailedError as e: assert isinstance(e.cause, ApplicationError) - assert e.cause.message == "Cannot allocate 3 nodes; have only 1 available" + assert e.cause.message == "Cannot assign 3 nodes; have only 1 available" finally: await cluster_manager_handle.signal(ClusterManagerWorkflow.shutdown_cluster) result = await cluster_manager_handle.result() diff --git a/updates_and_signals/safe_message_handlers/README.md b/updates_and_signals/safe_message_handlers/README.md index 7dd71d1d..44bb9457 100644 --- a/updates_and_signals/safe_message_handlers/README.md +++ b/updates_and_signals/safe_message_handlers/README.md @@ -7,7 +7,7 @@ This sample shows off important techniques for handling signals and updates, aka * Message handlers can block and their actions can be interleaved with one 
another and with the main workflow. This can easily cause bugs, so we use a lock to protect shared state from interleaved access. * Message handlers should also finish before the workflow run completes. One option is to use a lock. * An "Entity" workflow, i.e. a long-lived workflow, periodically "continues as new". It must do this to prevent its history from growing too large, and it passes its state to the next workflow. You can check `workflow.info().is_continue_as_new_suggested()` to see when it's time. Just make sure message handlers have finished before doing so. -* Message handlers can be made idempotent. See update `ClusterManager.allocate_n_nodes_to_job`. +* Message handlers can be made idempotent. See update `ClusterManager.assign_nodes_to_job`. To run, first see [README.md](../../README.md) for prerequisites. diff --git a/updates_and_signals/safe_message_handlers/activities.py b/updates_and_signals/safe_message_handlers/activities.py index 5360dfef..3a1c9cd2 100644 --- a/updates_and_signals/safe_message_handlers/activities.py +++ b/updates_and_signals/safe_message_handlers/activities.py @@ -6,25 +6,25 @@ @dataclass -class AllocateNodesToJobInput: +class AssignNodesToJobInput: nodes: List[str] job_name: str @activity.defn -async def allocate_nodes_to_job(input: AllocateNodesToJobInput) -> None: +async def assign_nodes_to_job(input: AssignNodesToJobInput) -> None: print(f"Assigning nodes {input.nodes} to job {input.job_name}") await asyncio.sleep(0.1) @dataclass -class DeallocateNodesForJobInput: +class UnassignNodesForJobInput: nodes: List[str] job_name: str @activity.defn -async def deallocate_nodes_for_job(input: DeallocateNodesForJobInput) -> None: +async def unassign_nodes_for_job(input: UnassignNodesForJobInput) -> None: print(f"Deallocating nodes {input.nodes} from job {input.job_name}") await asyncio.sleep(0.1) diff --git a/updates_and_signals/safe_message_handlers/starter.py b/updates_and_signals/safe_message_handlers/starter.py index eed63e73..f318fd3b 100644 --- a/updates_and_signals/safe_message_handlers/starter.py +++ b/updates_and_signals/safe_message_handlers/starter.py @@ -4,11 +4,11 @@ import uuid from typing import Optional -from temporalio import client, common +from temporalio import common from temporalio.client import Client, WorkflowHandle from updates_and_signals.safe_message_handlers.workflow import ( - ClusterManagerAllocateNNodesToJobInput, + ClusterManagerAssignNodesToJobInput, ClusterManagerDeleteJobInput, ClusterManagerInput, ClusterManagerWorkflow, @@ -23,9 +23,9 @@ async def do_cluster_lifecycle(wf: WorkflowHandle, delay_seconds: Optional[int] for i in range(6): allocation_updates.append( wf.execute_update( - ClusterManagerWorkflow.allocate_n_nodes_to_job, - ClusterManagerAllocateNNodesToJobInput( - num_nodes=2, job_name=f"task-{i}" + ClusterManagerWorkflow.assign_nodes_to_job, + ClusterManagerAssignNodesToJobInput( + total_num_nodes=2, job_name=f"task-{i}" ), ) ) diff --git a/updates_and_signals/safe_message_handlers/worker.py b/updates_and_signals/safe_message_handlers/worker.py index 440a8756..487ed4cf 100644 --- a/updates_and_signals/safe_message_handlers/worker.py +++ b/updates_and_signals/safe_message_handlers/worker.py @@ -1,14 +1,13 @@ import asyncio import logging -from temporalio import activity, common, workflow -from temporalio.client import Client, WorkflowHandle +from temporalio.client import Client from temporalio.worker import Worker from updates_and_signals.safe_message_handlers.workflow import ( ClusterManagerWorkflow, - 
allocate_nodes_to_job, - deallocate_nodes_for_job, + assign_nodes_to_job, + unassign_nodes_for_job, find_bad_nodes, ) @@ -23,7 +22,7 @@ async def main(): client, task_queue="safe-message-handlers-task-queue", workflows=[ClusterManagerWorkflow], - activities=[allocate_nodes_to_job, deallocate_nodes_for_job, find_bad_nodes], + activities=[assign_nodes_to_job, unassign_nodes_for_job, find_bad_nodes], ): # Wait until interrupted logging.info("ClusterManagerWorkflow worker started, ctrl+c to exit") diff --git a/updates_and_signals/safe_message_handlers/workflow.py b/updates_and_signals/safe_message_handlers/workflow.py index 0b1b1b65..7e253e9b 100644 --- a/updates_and_signals/safe_message_handlers/workflow.py +++ b/updates_and_signals/safe_message_handlers/workflow.py @@ -9,11 +9,11 @@ from temporalio.exceptions import ApplicationError from updates_and_signals.safe_message_handlers.activities import ( - AllocateNodesToJobInput, - DeallocateNodesForJobInput, + AssignNodesToJobInput, + UnassignNodesForJobInput, FindBadNodesInput, - allocate_nodes_to_job, - deallocate_nodes_for_job, + assign_nodes_to_job, + unassign_nodes_for_job, find_bad_nodes, ) @@ -25,7 +25,7 @@ class ClusterManagerState: cluster_started: bool = False cluster_shutdown: bool = False nodes: Dict[str, Optional[str]] = dataclasses.field(default_factory=dict) - jobs_added: Set[str] = dataclasses.field(default_factory=set) + jobs_assigned: Set[str] = dataclasses.field(default_factory=set) @dataclass @@ -42,8 +42,9 @@ class ClusterManagerResult: # Be in the habit of storing message inputs and outputs in serializable structures. # This makes it easier to add more over time in a backward-compatible way. @dataclass -class ClusterManagerAllocateNNodesToJobInput: - num_nodes: int +class ClusterManagerAssignNodesToJobInput: + # If larger or smaller than previous amounts, will resize the job. + total_num_nodes: int job_name: str @@ -52,10 +53,10 @@ class ClusterManagerDeleteJobInput: job_name: str @dataclass -class ClusterManagerAllocateNNodesToJobResult: +class ClusterManagerAssignNodesToJobResult: nodes_assigned: Set[str] -# ClusterManagerWorkflow keeps track of the allocations of a cluster of nodes. +# ClusterManagerWorkflow keeps track of the assignments of a cluster of nodes. # Via signals, the cluster can be started and shutdown. # Via updates, clients can also assign jobs to nodes and delete jobs. # These updates must run atomically. @@ -84,52 +85,52 @@ async def shutdown_cluster(self) -> None: # before sending work to those nodes. # Returns the list of node names that were allocated to the job. @workflow.update - async def allocate_n_nodes_to_job( - self, input: ClusterManagerAllocateNNodesToJobInput - ) -> ClusterManagerAllocateNNodesToJobResult: + async def assign_nodes_to_job( + self, input: ClusterManagerAssignNodesToJobInput + ) -> ClusterManagerAssignNodesToJobResult: await workflow.wait_condition(lambda: self.state.cluster_started) if self.state.cluster_shutdown: # If you want the client to receive a failure, either add an update validator and throw the # exception from there, or raise an ApplicationError. Other exceptions in the main handler # will cause the workflow to keep retrying and get it stuck. raise ApplicationError( - "Cannot allocate nodes to a job: Cluster is already shut down" + "Cannot assign nodes to a job: Cluster is already shut down" ) async with self.nodes_lock: # Idempotency guard. 
- if input.job_name in self.state.jobs_added: - return ClusterManagerAllocateNNodesToJobResult( + if input.job_name in self.state.jobs_assigned: + return ClusterManagerAssignNodesToJobResult( self.get_assigned_nodes(job_name=input.job_name)) unassigned_nodes = self.get_unassigned_nodes() - if len(unassigned_nodes) < input.num_nodes: + if len(unassigned_nodes) < input.total_num_nodes: # If you want the client to receive a failure, either add an update validator and throw the # exception from there, or raise an ApplicationError. Other exceptions in the main handler # will cause the workflow to keep retrying and get it stuck. raise ApplicationError( - f"Cannot allocate {input.num_nodes} nodes; have only {len(unassigned_nodes)} available" + f"Cannot assign {input.total_num_nodes} nodes; have only {len(unassigned_nodes)} available" ) - nodes_to_assign = unassigned_nodes[: input.num_nodes] + nodes_to_assign = unassigned_nodes[: input.total_num_nodes] # This await would be dangerous without nodes_lock because it yields control and allows interleaving # with delete_job and perform_health_checks, which both touch self.state.nodes. - await self._allocate_nodes_to_job(nodes_to_assign, input.job_name) - return ClusterManagerAllocateNNodesToJobResult( + await self._assign_nodes_to_job(nodes_to_assign, input.job_name) + return ClusterManagerAssignNodesToJobResult( nodes_assigned=self.get_assigned_nodes(job_name=input.job_name)) - async def _allocate_nodes_to_job( + async def _assign_nodes_to_job( self, assigned_nodes: List[str], job_name: str ) -> None: await workflow.execute_activity( - allocate_nodes_to_job, - AllocateNodesToJobInput(nodes=assigned_nodes, job_name=job_name), + assign_nodes_to_job, + AssignNodesToJobInput(nodes=assigned_nodes, job_name=job_name), start_to_close_timeout=timedelta(seconds=10), ) for node in assigned_nodes: self.state.nodes[node] = job_name - self.state.jobs_added.add(job_name) + self.state.jobs_assigned.add(job_name) # Even though it returns nothing, this is an update because the client may want to track it, for example - # to wait for nodes to be deallocated before reassigning them. + # to wait for nodes to be unassignd before reassigning them. @workflow.update async def delete_job(self, input: ClusterManagerDeleteJobInput) -> None: await workflow.wait_condition(lambda: self.state.cluster_started) @@ -140,20 +141,20 @@ async def delete_job(self, input: ClusterManagerDeleteJobInput) -> None: raise ApplicationError("Cannot delete a job: Cluster is already shut down") async with self.nodes_lock: - nodes_to_free = [ + nodes_to_unassign = [ k for k, v in self.state.nodes.items() if v == input.job_name ] # This await would be dangerous without nodes_lock because it yields control and allows interleaving - # with allocate_n_nodes_to_job and perform_health_checks, which all touch self.state.nodes. - await self._deallocate_nodes_for_job(nodes_to_free, input.job_name) + # with assign_nodes_to_job and perform_health_checks, which all touch self.state.nodes. 
+ await self._unassign_nodes_for_job(nodes_to_unassign, input.job_name) - async def _deallocate_nodes_for_job(self, nodes_to_free: List[str], job_name: str): + async def _unassign_nodes_for_job(self, nodes_to_unassign: List[str], job_name: str): await workflow.execute_activity( - deallocate_nodes_for_job, - DeallocateNodesForJobInput(nodes=nodes_to_free, job_name=job_name), + unassign_nodes_for_job, + UnassignNodesForJobInput(nodes=nodes_to_unassign, job_name=job_name), start_to_close_timeout=timedelta(seconds=10), ) - for node in nodes_to_free: + for node in nodes_to_unassign: self.state.nodes[node] = None def get_unassigned_nodes(self) -> List[str]: @@ -173,17 +174,20 @@ def get_assigned_nodes(self, *, job_name: Optional[str] = None) -> Set[str]: async def perform_health_checks(self) -> None: async with self.nodes_lock: assigned_nodes = self.get_assigned_nodes() - # This await would be dangerous without nodes_lock because it yields control and allows interleaving - # with allocate_n_nodes_to_job and delete_job, which both touch self.state.nodes. - bad_nodes = await workflow.execute_activity( - find_bad_nodes, - FindBadNodesInput(nodes_to_check=assigned_nodes), - start_to_close_timeout=timedelta(seconds=10), - # This health check is optional, and our lock would block the whole workflow if we let it retry forever. - retry_policy=RetryPolicy(maximum_attempts=1), - ) - for node in bad_nodes: - self.state.nodes[node] = "BAD!" + try: + # This await would be dangerous without nodes_lock because it yields control and allows interleaving + # with assign_nodes_to_job and delete_job, which both touch self.state.nodes. + bad_nodes = await workflow.execute_activity( + find_bad_nodes, + FindBadNodesInput(nodes_to_check=assigned_nodes), + start_to_close_timeout=timedelta(seconds=10), + # This health check is optional, and our lock would block the whole workflow if we let it retry forever. + retry_policy=RetryPolicy(maximum_attempts=1), + ) + for node in bad_nodes: + self.state.nodes[node] = "BAD!" + except Exception as e: + workflow.logger.warn(f"Health check failed with error {type(e).__name__}:{e}") # The cluster manager is a long-running "entity" workflow so we need to periodically checkpoint its state and # continue-as-new. 
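Taken together, the hunks above converge on a single handler pattern: gate the handler on workflow state with workflow.wait_condition, hold an asyncio.Lock across any await that reads and then writes shared state, and raise ApplicationError when the failure should reach the caller instead of retrying the workflow task. A minimal, self-contained sketch of just that pattern follows; the class name, activity name, and node count are illustrative assumptions, not the sample's actual code.

import asyncio
from datetime import timedelta
from typing import Dict, List, Optional

from temporalio import workflow
from temporalio.exceptions import ApplicationError


@workflow.defn
class LockedHandlerSketch:
    def __init__(self) -> None:
        self.started = False
        self.shutdown_requested = False
        # Four placeholder nodes; None means unassigned.
        self.nodes: Dict[str, Optional[str]] = {str(i): None for i in range(4)}
        # Guards self.nodes across awaits so concurrent handlers cannot interleave.
        self.nodes_lock = asyncio.Lock()

    @workflow.signal
    def start(self) -> None:
        self.started = True

    @workflow.signal
    def shutdown(self) -> None:
        self.shutdown_requested = True

    @workflow.update
    async def assign(self, job_name: str, num_nodes: int) -> List[str]:
        # Handlers may run before the main workflow logic; gate them on state.
        await workflow.wait_condition(lambda: self.started)
        async with self.nodes_lock:
            free = [k for k, v in self.nodes.items() if v is None]
            if len(free) < num_nodes:
                # ApplicationError fails the update for the caller; most other
                # exceptions would retry the workflow task instead.
                raise ApplicationError(f"only {len(free)} nodes are free")
            chosen = free[:num_nodes]
            # This await yields control; without nodes_lock another handler could
            # also see these nodes as free and double-assign them.
            await workflow.execute_activity(
                "assign_nodes_to_job",  # assumed to be registered on the worker
                args=[chosen, job_name],
                start_to_close_timeout=timedelta(seconds=10),
            )
            for node in chosen:
                self.nodes[node] = job_name
            return chosen

    @workflow.run
    async def run(self) -> None:
        await workflow.wait_condition(lambda: self.shutdown_requested)

The sample's ClusterManagerWorkflow layers idempotency (jobs_assigned), dataclass inputs and outputs, health checks, and continue-as-new on top of this core.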
From 74867f16c224defa2450f5de89a1f848a2518a57 Mon Sep 17 00:00:00 2001 From: Drew Hoskins Date: Mon, 8 Jul 2024 15:59:42 -0700 Subject: [PATCH 26/28] poe format --- .../safe_message_handlers/workflow_test.py | 14 +++++--- .../safe_message_handlers/worker.py | 2 +- .../safe_message_handlers/workflow.py | 35 +++++++++++++------ 3 files changed, 35 insertions(+), 16 deletions(-) diff --git a/tests/updates_and_signals/safe_message_handlers/workflow_test.py b/tests/updates_and_signals/safe_message_handlers/workflow_test.py index aed025d9..93a9ca5e 100644 --- a/tests/updates_and_signals/safe_message_handlers/workflow_test.py +++ b/tests/updates_and_signals/safe_message_handlers/workflow_test.py @@ -7,8 +7,8 @@ from updates_and_signals.safe_message_handlers.activities import ( assign_nodes_to_job, - unassign_nodes_for_job, find_bad_nodes, + unassign_nodes_for_job, ) from updates_and_signals.safe_message_handlers.workflow import ( ClusterManagerAssignNodesToJobInput, @@ -85,12 +85,16 @@ async def test_update_idempotency(client: Client): result_1 = await cluster_manager_handle.execute_update( ClusterManagerWorkflow.assign_nodes_to_job, - ClusterManagerAssignNodesToJobInput(total_num_nodes=5, job_name="jobby-job"), + ClusterManagerAssignNodesToJobInput( + total_num_nodes=5, job_name="jobby-job" + ), ) # simulate that in calling it twice, the operation is idempotent result_2 = await cluster_manager_handle.execute_update( ClusterManagerWorkflow.assign_nodes_to_job, - ClusterManagerAssignNodesToJobInput(total_num_nodes=5, job_name="jobby-job"), + ClusterManagerAssignNodesToJobInput( + total_num_nodes=5, job_name="jobby-job" + ), ) # the second call should not assign more nodes (it may return fewer if the health check finds bad nodes # in between the two signals.) @@ -116,7 +120,9 @@ async def test_update_failure(client: Client): await cluster_manager_handle.execute_update( ClusterManagerWorkflow.assign_nodes_to_job, - ClusterManagerAssignNodesToJobInput(total_num_nodes=24, job_name="big-task"), + ClusterManagerAssignNodesToJobInput( + total_num_nodes=24, job_name="big-task" + ), ) try: # Try to assign too many nodes diff --git a/updates_and_signals/safe_message_handlers/worker.py b/updates_and_signals/safe_message_handlers/worker.py index 487ed4cf..5e28eca3 100644 --- a/updates_and_signals/safe_message_handlers/worker.py +++ b/updates_and_signals/safe_message_handlers/worker.py @@ -7,8 +7,8 @@ from updates_and_signals.safe_message_handlers.workflow import ( ClusterManagerWorkflow, assign_nodes_to_job, - unassign_nodes_for_job, find_bad_nodes, + unassign_nodes_for_job, ) interrupt_event = asyncio.Event() diff --git a/updates_and_signals/safe_message_handlers/workflow.py b/updates_and_signals/safe_message_handlers/workflow.py index 7e253e9b..b8230578 100644 --- a/updates_and_signals/safe_message_handlers/workflow.py +++ b/updates_and_signals/safe_message_handlers/workflow.py @@ -10,11 +10,11 @@ from updates_and_signals.safe_message_handlers.activities import ( AssignNodesToJobInput, - UnassignNodesForJobInput, FindBadNodesInput, + UnassignNodesForJobInput, assign_nodes_to_job, - unassign_nodes_for_job, find_bad_nodes, + unassign_nodes_for_job, ) @@ -39,7 +39,8 @@ class ClusterManagerResult: num_currently_assigned_nodes: int num_bad_nodes: int -# Be in the habit of storing message inputs and outputs in serializable structures. + +# Be in the habit of storing message inputs and outputs in serializable structures. # This makes it easier to add more over time in a backward-compatible way. 
@dataclass class ClusterManagerAssignNodesToJobInput: @@ -52,10 +53,12 @@ class ClusterManagerAssignNodesToJobInput: class ClusterManagerDeleteJobInput: job_name: str -@dataclass + +@dataclass class ClusterManagerAssignNodesToJobResult: nodes_assigned: Set[str] + # ClusterManagerWorkflow keeps track of the assignments of a cluster of nodes. # Via signals, the cluster can be started and shutdown. # Via updates, clients can also assign jobs to nodes and delete jobs. @@ -101,7 +104,8 @@ async def assign_nodes_to_job( # Idempotency guard. if input.job_name in self.state.jobs_assigned: return ClusterManagerAssignNodesToJobResult( - self.get_assigned_nodes(job_name=input.job_name)) + self.get_assigned_nodes(job_name=input.job_name) + ) unassigned_nodes = self.get_unassigned_nodes() if len(unassigned_nodes) < input.total_num_nodes: # If you want the client to receive a failure, either add an update validator and throw the @@ -115,7 +119,8 @@ async def assign_nodes_to_job( # with delete_job and perform_health_checks, which both touch self.state.nodes. await self._assign_nodes_to_job(nodes_to_assign, input.job_name) return ClusterManagerAssignNodesToJobResult( - nodes_assigned=self.get_assigned_nodes(job_name=input.job_name)) + nodes_assigned=self.get_assigned_nodes(job_name=input.job_name) + ) async def _assign_nodes_to_job( self, assigned_nodes: List[str], job_name: str @@ -148,7 +153,9 @@ async def delete_job(self, input: ClusterManagerDeleteJobInput) -> None: # with assign_nodes_to_job and perform_health_checks, which all touch self.state.nodes. await self._unassign_nodes_for_job(nodes_to_unassign, input.job_name) - async def _unassign_nodes_for_job(self, nodes_to_unassign: List[str], job_name: str): + async def _unassign_nodes_for_job( + self, nodes_to_unassign: List[str], job_name: str + ): await workflow.execute_activity( unassign_nodes_for_job, UnassignNodesForJobInput(nodes=nodes_to_unassign, job_name=job_name), @@ -167,9 +174,13 @@ def get_assigned_nodes(self, *, job_name: Optional[str] = None) -> Set[str]: if job_name: return set([k for k, v in self.state.nodes.items() if v == job_name]) else: - return set([ - k for k, v in self.state.nodes.items() if v is not None and v != "BAD!" - ]) + return set( + [ + k + for k, v in self.state.nodes.items() + if v is not None and v != "BAD!" + ] + ) async def perform_health_checks(self) -> None: async with self.nodes_lock: @@ -187,7 +198,9 @@ async def perform_health_checks(self) -> None: for node in bad_nodes: self.state.nodes[node] = "BAD!" except Exception as e: - workflow.logger.warn(f"Health check failed with error {type(e).__name__}:{e}") + workflow.logger.warn( + f"Health check failed with error {type(e).__name__}:{e}" + ) # The cluster manager is a long-running "entity" workflow so we need to periodically checkpoint its state and # continue-as-new. 
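Beyond the formatting pass, the tests touched above double as usage documentation for the handlers: the same assign_nodes_to_job update can be issued twice and the idempotency guard simply returns the nodes already assigned, while over-allocation surfaces to the caller as a WorkflowUpdateFailedError whose cause is the ApplicationError raised inside the handler. A rough client-side sketch of that calling sequence follows; the workflow id, node counts, and prints are illustrative assumptions, and ClusterManagerInput is assumed importable from the sample's workflow module.

import asyncio
import uuid

from temporalio.client import Client, WorkflowUpdateFailedError
from temporalio.exceptions import ApplicationError

from updates_and_signals.safe_message_handlers.workflow import (
    ClusterManagerAssignNodesToJobInput,
    ClusterManagerInput,
    ClusterManagerWorkflow,
)


async def demo() -> None:
    client = await Client.connect("localhost:7233")
    handle = await client.start_workflow(
        ClusterManagerWorkflow.run,
        ClusterManagerInput(test_continue_as_new=False),
        id=f"cluster-manager-{uuid.uuid4()}",
        task_queue="safe-message-handlers-task-queue",
    )
    await handle.signal(ClusterManagerWorkflow.start_cluster)

    # Calling the update twice is safe: the jobs_assigned guard makes the second
    # call return the nodes already assigned instead of assigning more.
    for _ in range(2):
        result = await handle.execute_update(
            ClusterManagerWorkflow.assign_nodes_to_job,
            ClusterManagerAssignNodesToJobInput(
                total_num_nodes=5, job_name="jobby-job"
            ),
        )
        print(f"Assigned nodes: {result.nodes_assigned}")

    # Asking for more nodes than exist fails only this update, not the workflow.
    try:
        await handle.execute_update(
            ClusterManagerWorkflow.assign_nodes_to_job,
            ClusterManagerAssignNodesToJobInput(
                total_num_nodes=10_000, job_name="too-big"
            ),
        )
    except WorkflowUpdateFailedError as err:
        assert isinstance(err.cause, ApplicationError)

    await handle.signal(ClusterManagerWorkflow.shutdown_cluster)
    print(await handle.result())


if __name__ == "__main__":
    asyncio.run(demo())

A worker from the sample's worker.py must be running on the same task queue for the updates to make progress.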
From c6bdd128a7afc268cbe0347579aec5d4d5506a37 Mon Sep 17 00:00:00 2001
From: Drew Hoskins
Date: Mon, 8 Jul 2024 16:06:06 -0700
Subject: [PATCH 27/28] Minor sample improvements

---
 updates_and_signals/safe_message_handlers/starter.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/updates_and_signals/safe_message_handlers/starter.py b/updates_and_signals/safe_message_handlers/starter.py
index f318fd3b..adb56a66 100644
--- a/updates_and_signals/safe_message_handlers/starter.py
+++ b/updates_and_signals/safe_message_handlers/starter.py
@@ -19,6 +19,7 @@ async def do_cluster_lifecycle(wf: WorkflowHandle, delay_seconds: Optional[int]
 
     await wf.signal(ClusterManagerWorkflow.start_cluster)
 
+    print("Assigning jobs to nodes...")
     allocation_updates = []
     for i in range(6):
         allocation_updates.append(
@@ -31,9 +32,11 @@
         )
     await asyncio.gather(*allocation_updates)
 
     if delay_seconds:
+        print(f"Sleeping for {delay_seconds} second(s)")
         await asyncio.sleep(delay_seconds)
 
+    print("Deleting jobs...")
     deletion_updates = []
     for i in range(6):
         deletion_updates.append(
@@ -51,6 +54,7 @@ async def main(should_test_continue_as_new: bool):
 
     # Connect to Temporal
     client = await Client.connect("localhost:7233")
 
+    print("Starting cluster")
     cluster_manager_handle = await client.start_workflow(
         ClusterManagerWorkflow.run,
         ClusterManagerInput(test_continue_as_new=should_test_continue_as_new),
@@ -62,7 +66,7 @@
     await do_cluster_lifecycle(cluster_manager_handle, delay_seconds=delay_seconds)
     result = await cluster_manager_handle.result()
     print(
-        f"Cluster shut down successfully. It peaked at {result.max_assigned_nodes} assigned nodes ."
+        f"Cluster shut down successfully."
         f" It had {result.num_currently_assigned_nodes} nodes assigned at the end."
) From 62f24a23d4fd28e2ed226f0c5d8ca2d50fef8559 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Mon, 22 Jul 2024 10:31:48 -0400 Subject: [PATCH 28/28] Skip update tests under Java test server --- .../safe_message_handlers/workflow_test.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/updates_and_signals/safe_message_handlers/workflow_test.py b/tests/updates_and_signals/safe_message_handlers/workflow_test.py index 93a9ca5e..852419ef 100644 --- a/tests/updates_and_signals/safe_message_handlers/workflow_test.py +++ b/tests/updates_and_signals/safe_message_handlers/workflow_test.py @@ -1,8 +1,10 @@ import asyncio import uuid +import pytest from temporalio.client import Client, WorkflowUpdateFailedError from temporalio.exceptions import ApplicationError +from temporalio.testing import WorkflowEnvironment from temporalio.worker import Worker from updates_and_signals.safe_message_handlers.activities import ( @@ -18,7 +20,11 @@ ) -async def test_safe_message_handlers(client: Client): +async def test_safe_message_handlers(client: Client, env: WorkflowEnvironment): + if env.supports_time_skipping: + pytest.skip( + "Java test server: https://github.com/temporalio/sdk-java/issues/1903" + ) task_queue = f"tq-{uuid.uuid4()}" async with Worker( client, @@ -66,7 +72,11 @@ async def test_safe_message_handlers(client: Client): assert result.num_currently_assigned_nodes == 0 -async def test_update_idempotency(client: Client): +async def test_update_idempotency(client: Client, env: WorkflowEnvironment): + if env.supports_time_skipping: + pytest.skip( + "Java test server: https://github.com/temporalio/sdk-java/issues/1903" + ) task_queue = f"tq-{uuid.uuid4()}" async with Worker( client, @@ -101,7 +111,11 @@ async def test_update_idempotency(client: Client): assert result_1.nodes_assigned >= result_2.nodes_assigned -async def test_update_failure(client: Client): +async def test_update_failure(client: Client, env: WorkflowEnvironment): + if env.supports_time_skipping: + pytest.skip( + "Java test server: https://github.com/temporalio/sdk-java/issues/1903" + ) task_queue = f"tq-{uuid.uuid4()}" async with Worker( client,