1 file changed: +6 -1 lines changed
@@ -188,9 +188,14 @@ def train_func(config):
188188 if config ["evaluate" ]:
189189 test_history = multi_worker_model .evaluate (eval_tf_dataset , callbacks = callbacks )
190190 results .append (test_history )
191+
192+ # Only report the checkpoint from the chief worker (rank 0) to avoid race conditions.
193+ # However, save must still be called on all workers to avoid deadlock.
191194 with tempfile .TemporaryDirectory () as temp_checkpoint_dir :
192195 multi_worker_model .save (temp_checkpoint_dir , save_format = "tf" )
193- checkpoint = Checkpoint .from_directory (temp_checkpoint_dir )
196+ checkpoint = None
197+ if session .get_world_rank () == 0 :
198+ checkpoint = Checkpoint .from_directory (temp_checkpoint_dir )
194199
195200 session .report ({}, checkpoint = checkpoint )
196201
You can’t perform that action at this time.
0 commit comments