Skip to content

Commit 89e8260

Browse files
committed
fix: propagate remote exception traceback to parameter server
1 parent 279a908 commit 89e8260

File tree

2 files changed

+11
-4
lines changed

2 files changed

+11
-4
lines changed

checkpoint_engine/ps.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1284,9 +1284,9 @@ def _update_per_bucket(
12841284
dist.broadcast(buffer_b, src=brank)
12851285
resp = socket.recv()
12861286
if resp != b"":
1287-
exception_obj = pickle.loads(resp)
1287+
msg = pickle.loads(resp)
12881288
logger.error(
1289-
f"[rank{self._rank}] receive error response '{type(exception_obj).__name__}: {exception_obj}' from rank {receiver_rank} for bucket {gidx} in checkpoint {checkpoint_name}"
1289+
f"[rank{self._rank}] receive error response from rank {receiver_rank} for bucket {gidx} in checkpoint {checkpoint_name}: {msg}"
12901290
)
12911291
ret_code.fill_(1)
12921292
dist.all_reduce(ret_code, op=dist.ReduceOp.SUM)

checkpoint_engine/worker.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
import torch
66
import zmq
7+
import traceback
78

89
from checkpoint_engine.device_utils import DeviceManager, npu_generate_uuid
910

@@ -63,7 +64,10 @@ def update_weights_from_ipc(
6364
assert buffer.dtype == torch.uint8
6465
socket.send(b"")
6566
except Exception as e:
66-
socket.send_pyobj(e)
67+
msg = "".join(
68+
traceback.format_exception(type(e), e, e.__traceback__)
69+
)
70+
socket.send_pyobj(msg)
6771
socket.recv() # wait for ack
6872
raise
6973
try:
@@ -83,7 +87,10 @@ def update_weights_from_ipc(
8387
except Exception as e: # noqa: BLE001
8488
# Send exception back to Parameter Server.
8589
# Don't raise here. Because all workers should quit in the same way by receiving the exception from PS
86-
socket.send_pyobj(e)
90+
msg = "".join(
91+
traceback.format_exception(type(e), e, e.__traceback__)
92+
)
93+
socket.send_pyobj(msg)
8794
elif isinstance(
8895
payload, Exception
8996
): # error occurred, got force quit signal from Parameter Server

0 commit comments

Comments
 (0)