Skip to content

Commit 9267b23

Browse files
committed
fix: propagate remote exception traceback to parameter server
1 parent 279a908 commit 9267b23

File tree

2 files changed

+7
-4
lines changed

2 files changed

+7
-4
lines changed

checkpoint_engine/ps.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1284,9 +1284,9 @@ def _update_per_bucket(
12841284
dist.broadcast(buffer_b, src=brank)
12851285
resp = socket.recv()
12861286
if resp != b"":
1287-
exception_obj = pickle.loads(resp)
1287+
msg = resp.decode("utf-8")
12881288
logger.error(
1289-
f"[rank{self._rank}] receive error response '{type(exception_obj).__name__}: {exception_obj}' from rank {receiver_rank} for bucket {gidx} in checkpoint {checkpoint_name}"
1289+
f"[rank{self._rank}] receive error response from rank {receiver_rank} for bucket {gidx} in checkpoint {checkpoint_name}: {msg}"
12901290
)
12911291
ret_code.fill_(1)
12921292
dist.all_reduce(ret_code, op=dist.ReduceOp.SUM)

checkpoint_engine/worker.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import gc
2+
import traceback
23
from collections.abc import Callable
34
from typing import TypedDict
45

@@ -63,7 +64,8 @@ def update_weights_from_ipc(
6364
assert buffer.dtype == torch.uint8
6465
socket.send(b"")
6566
except Exception as e:
66-
socket.send_pyobj(e)
67+
msg = "".join(traceback.format_exception(type(e), e, e.__traceback__))
68+
socket.send_string(msg)
6769
socket.recv() # wait for ack
6870
raise
6971
try:
@@ -83,7 +85,8 @@ def update_weights_from_ipc(
8385
except Exception as e: # noqa: BLE001
8486
# Send exception back to Parameter Server.
8587
# Don't raise here. Because all workers should quit in the same way by receiving the exception from PS
86-
socket.send_pyobj(e)
88+
msg = "".join(traceback.format_exception(type(e), e, e.__traceback__))
89+
socket.send_string(msg)
8790
elif isinstance(
8891
payload, Exception
8992
): # error occurred, got force quit signal from Parameter Server

0 commit comments

Comments
 (0)