Skip to content

Commit 219007d

Browse files
authored
fix(core/saver): create checkpoint_dir in each rank when initialize_checkpoint (#63)
Fix the error: ``` [MLF 2026-02-26 19:09:24,334 ERROR Step=110 Rank=5 ml_flashpoint.adapter.megatron.save_utils:78] Failed to save ML Flashpoint checkpoint. Skipping saving and continuing. Traceback (most recent call last): File "/tmp/ml-flashpoint/src/ml_flashpoint/adapter/megatron/save_utils.py", line 67, in save_local_aware_megatron_checkpoint return save_strategy.async_save( ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/contextlib.py", line 81, in inner return func(*args, **kwds) ^^^^^^^^^^^^^^^^^^^ File "/tmp/ml-flashpoint/src/ml_flashpoint/adapter/megatron/save_strategies.py", line 220, in async_save with open(os.path.join(checkpoint_dir, "metadata.json"), "w") as f: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ FileNotFoundError: [Errno 2] No such file or directory: '/tmp/logs/ml-flashpoint/job-plan/step-110_ckpt/metadata.json' ```
1 parent e6c2a59 commit 219007d

File tree

2 files changed

+8
-15
lines changed

2 files changed

+8
-15
lines changed

src/ml_flashpoint/core/checkpoint_saver.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -339,9 +339,8 @@ def __setstate__(self, state):
339339
@log_execution_time(logger=_LOGGER, name="initialize_checkpoint")
340340
def initialize_checkpoint(self, checkpoint_id: CheckpointContainerId) -> None:
341341
self._create_dirty_checkpoint_marker(checkpoint_id)
342-
if self._local_rank_getter() == 0:
343-
os.makedirs(checkpoint_id.data, exist_ok=True)
344-
_LOGGER.info("Created checkpoint directory: '%s'", checkpoint_id.data)
342+
os.makedirs(checkpoint_id.data, exist_ok=True)
343+
_LOGGER.info("Created checkpoint directory: '%s'", checkpoint_id.data)
345344

346345
@override
347346
@log_execution_time(logger=_LOGGER, name="stage_data", level=logging.INFO)

tests/core/test_checkpoint_saver.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,7 @@ def mock_open(file, mode):
406406
assert not os.path.exists(checkpoint_id.data)
407407

408408
@pytest.mark.parametrize("local_rank", [0, 1, 8])
409-
def test_initialize_checkpoint_creates_container_dir_when_not_exists_local_rank_0(
409+
def test_initialize_checkpoint_creates_container_dir_when_not_exists(
410410
self,
411411
local_rank,
412412
temp_dir_path,
@@ -427,12 +427,9 @@ def test_initialize_checkpoint_creates_container_dir_when_not_exists_local_rank_
427427
saver.initialize_checkpoint(checkpoint_id)
428428

429429
# Then
430-
# Check for directory creation only on local rank 0
431-
if local_rank == 0:
432-
assert os.path.exists(checkpoint_id.data)
433-
assert os.path.isdir(checkpoint_id.data)
434-
else:
435-
assert not os.path.exists(checkpoint_id.data)
430+
# Check for directory creation on all ranks
431+
assert os.path.exists(checkpoint_id.data)
432+
assert os.path.isdir(checkpoint_id.data)
436433

437434
@pytest.mark.parametrize("local_rank", [0, 1])
438435
def test_initialize_checkpoint_leaves_container_dir_when_exists(
@@ -494,11 +491,8 @@ def test_initialize_checkpoint_idempotent(
494491

495492
# Then
496493
assert os.path.exists(expected_dirty_marker_path)
497-
if local_rank == 0:
498-
assert os.path.exists(checkpoint_id.data)
499-
assert os.path.isdir(checkpoint_id.data)
500-
else:
501-
assert not os.path.exists(checkpoint_id.data)
494+
assert os.path.exists(checkpoint_id.data)
495+
assert os.path.isdir(checkpoint_id.data)
502496

503497
def test_initialize_checkpoint_ensures_parent_dir_exists(
504498
self,

0 commit comments

Comments
 (0)