From b7680e30f19a2c862bb50c27329611b132a9b73d Mon Sep 17 00:00:00 2001 From: Kiuk Chung Date: Mon, 5 May 2025 13:05:03 -0700 Subject: [PATCH] (torchx/local_scheduler) Use os.kill instead of os.killpg when sending SIGTERM to the replica pid. Add runner.wait() for torchx.runner.test.api_test#test_empty_session_id to gracefully wait for the replicas to finish running Summary: Fixes macos unittest failures: https://github.com/pytorch/torchx/actions/runs/14844253481/job/41674243877 When looking into the test failure I noticed two things: 1. `local_scheduler` was trying to SIGTERM the process group by passing the replica's pid: `os.killpg(replica.pid, signal.SIGTERM)` . Changed to call `os.kill`. (note that `os.killpg` is not available on iOS which is why the test was failing). 2. The `torchx.runner.test.api_test.test_empty_session_id()` test case doesn't wait for the `echo` test command to finish hence there was a race condition where in certain cases the runner's `__exit__()` SIGTERMs the replica pids but since the `local_scheduler` was (wronfully) using `os.killpg` not `os.kill` it threw an uncaught error in iOS. Differential Revision: D74197282 --- torchx/runner/test/api_test.py | 1 + torchx/schedulers/local_scheduler.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/torchx/runner/test/api_test.py b/torchx/runner/test/api_test.py index 49b9335ab..616987b04 100644 --- a/torchx/runner/test/api_test.py +++ b/torchx/runner/test/api_test.py @@ -153,6 +153,7 @@ def test_empty_session_id(self, _: MagicMock) -> None: ) app_handle = runner.run(app, "local", self.cfg) + runner.wait(app_handle, wait_interval=0.1) scheduler, session_name, app_id = parse_app_handle(app_handle) self.assertEqual(scheduler, "local") diff --git a/torchx/schedulers/local_scheduler.py b/torchx/schedulers/local_scheduler.py index 9250ee72a..aa5f63bf5 100644 --- a/torchx/schedulers/local_scheduler.py +++ b/torchx/schedulers/local_scheduler.py @@ -311,7 +311,7 @@ def terminate(self) -> None: """ # safe to call terminate on a process that already died try: - os.killpg(self.proc.pid, signal.SIGTERM) + os.kill(self.proc.pid, signal.SIGTERM) except ProcessLookupError as e: log.debug(f"Process {self.proc.pid} already got terminated")