google
diff --git a/‎.github/workflows/build-and-test.yml‎
Lines changed: 24 additions & 6 deletions b/‎.github/workflows/build-and-test.yml‎
Lines changed: 24 additions & 6 deletions
diff --git a/‎.github/workflows/post-coverage-comment.yml‎
Lines changed: 84 additions & 0 deletions b/‎.github/workflows/post-coverage-comment.yml‎
Lines changed: 84 additions & 0 deletions
diff --git a/‎docs/README.md‎
Lines changed: 5 additions & 5 deletions b/‎docs/README.md‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎docs/user-guide.md‎
Lines changed: 14 additions & 2 deletions b/‎docs/user-guide.md‎
Lines changed: 14 additions & 2 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 27 additions & 1 deletion b/‎pyproject.toml‎
Lines changed: 27 additions & 1 deletion
diff --git a/‎src/ml_flashpoint/adapter/megatron/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎src/ml_flashpoint/adapter/megatron/__init__.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/ml_flashpoint/adapter/megatron/save_strategies.py‎
Lines changed: 33 additions & 14 deletions b/‎src/ml_flashpoint/adapter/megatron/save_strategies.py‎
Lines changed: 33 additions & 14 deletions
@@ -32,7 +32,6 @@ jobs:
       CPP_FAIL_UNDER: 80
     permissions:
       contents: read # Required for actions/checkout
-      pull-requests: write # Required to post the comment
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # ratchet:actions/checkout@v6
 
@@ -65,12 +64,14 @@ jobs:
       - name: Check Python test coverage
         run: |
           # Verify python coverage thresholds
+          echo -e "\n##### Generating Python coverage XML #####"
+          coverage xml -o python-coverage.xml
           echo -e "\n##### Verifying Python coverage thresholds #####"
           coverage report --fail-under=${{ env.PYTHON_FAIL_UNDER }}
-          coverage xml -o python-coverage.xml
 
       - name: Python Coverage Summary
         uses: irongut/CodeCoverageSummary@51cc3a756ddcd398d447c044c02cb6aa83fdae95 # ratchet:irongut/CodeCoverageSummary@v1.3.0
+        if: always() # Run even if threshold check above fails
         with:
           filename: python-coverage.xml
           badge: true
@@ -83,12 +84,16 @@ jobs:
           thresholds: '${{ env.PYTHON_FAIL_UNDER }} 95'
 
       - name: Add Python Coverage Title
+        if: always()
         run: |
-          echo '### Python Code Coverage Summary' | cat - code-coverage-results.md > temp && mv temp python-code-coverage-results.md
+          # Only run if the summary was actually generated
+          if [ -f code-coverage-results.md ]; then
+            echo '### Python Code Coverage Summary' | cat - code-coverage-results.md > temp && mv temp python-code-coverage-results.md
+            # Remove the consumed summary. The C++ title step reads the same
+            # file name and also runs under always(); without this cleanup a
+            # failed C++ summary step would leave this Python report behind and
+            # it would be published under the C++ heading.
+            rm -f code-coverage-results.md
+          fi
 
       - name: Add Python Coverage PR Comment
         uses: marocchino/sticky-pull-request-comment@773744901bac0e8cbb5a0dc842800d45e9b2b405 # ratchet:marocchino/sticky-pull-request-comment@v2
-        if: github.event_name == 'pull_request'
+        if: false # TODO remove once new workflow confirmed to work
         with:
           recreate: true
           path: python-code-coverage-results.md
@@ -116,6 +121,7 @@ jobs:
 
       - name: C++ Coverage Summary
         uses: irongut/CodeCoverageSummary@51cc3a756ddcd398d447c044c02cb6aa83fdae95 # ratchet:irongut/CodeCoverageSummary@v1.3.0
+        if: always() # Run even if threshold check above fails
         with:
           filename: cxx-coverage.xml
           badge: true
@@ -128,27 +134,39 @@ jobs:
           thresholds: '${{ env.CPP_FAIL_UNDER }} 40'
 
       - name: Add C++ Coverage Title
+        # Prepends a heading to the generated summary and stores the result as
+        # cpp-code-coverage-results.md. Skipped silently when no summary file exists.
+        # NOTE(review): code-coverage-results.md is the same file name the Python
+        # title step reads; if the C++ summary step did not write a fresh file, a
+        # leftover Python summary could be picked up here - confirm it is cleaned up.
+        if: always()
         run: |
-          echo '### C++ Code Coverage Summary' | cat - code-coverage-results.md > temp && mv temp cpp-code-coverage-results.md
+          if [ -f code-coverage-results.md ]; then
+            echo '### C++ Code Coverage Summary' | cat - code-coverage-results.md > temp && mv temp cpp-code-coverage-results.md
+          fi
 
       - name: Add C++ Coverage PR Comment
         uses: marocchino/sticky-pull-request-comment@773744901bac0e8cbb5a0dc842800d45e9b2b405 # ratchet:marocchino/sticky-pull-request-comment@v2
-        if: github.event_name == 'pull_request'
+        if: false # TODO: remove when new workflow confirmed to work
         with:
           header: cpp-coverage
           recreate: true
           path: cpp-code-coverage-results.md
 
+      - name: Save PR number
+        # Use always() so this runs even if previous coverage/test steps failed.
+        if: always() && github.event_name == 'pull_request'
+        run: |
+          echo ${{ github.event.number }} > pr_number.txt
+
       - name: Archive coverage reports
         uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # ratchet:actions/upload-artifact@v4
+        if: always()
         with:
           name: coverage-reports
+          if-no-files-found: warn # Default, but setting explicitly for awareness as non-PRs won't have pr_number.txt
           path: |
             htmlcov/
             python-coverage.xml
             cxx-coverage.xml
             python-code-coverage-results.md
             cpp-code-coverage-results.md
+            pr_number.txt
 
   lint-code:
     runs-on: ubuntu-latest
 
@@ -0,0 +1,84 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Post Coverage Comment
+
+on:
+  workflow_run:
+    workflows: ["Build and Test"]
+    types:
+      - completed
+
+jobs:
+  post-comment:
+    runs-on: ubuntu-latest
+    # This workflow runs in the context of the base repository, so it has write permissions
+    # even when the triggering workflow was from a fork.
+    # We run even if the build or thresholds failed, so long as it wasn't cancelled.
+    if: >
+      github.event.workflow_run.event == 'pull_request' &&
+      github.event.workflow_run.conclusion != 'cancelled'
+    # Declaring any permission sets every unlisted scope to "none", so each
+    # scope this job relies on must be listed explicitly.
+    permissions:
+      actions: read # Required to download the artifact from the triggering workflow run
+      pull-requests: write # Required to post the comment
+    steps:
+      - name: Download coverage reports artifact
+        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # ratchet:actions/download-artifact@v7
+        continue-on-error: true # Artifact might be missing on very early failures
+        with:
+          name: coverage-reports
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          run-id: ${{ github.event.workflow_run.id }}
+
+      - name: Check for coverage files
+        # We use an explicit step to check for file existence and set outputs.
+        # This is more robust than using hashFiles() in an 'if' expression,
+        # as hashFiles() is primarily intended for cache keys and lacks
+        # a dedicated file_exists() equivalent in GitHub Actions expressions.
+        #
+        # The artifact is produced by the triggering run, which for fork PRs
+        # executes untrusted code, so pr_number.txt is treated as untrusted input:
+        # it is accepted only when it consists solely of digits. This keeps a
+        # crafted file from injecting extra key=value lines into $GITHUB_OUTPUT.
+        # NOTE(review): this validates the format only; it does not prove the
+        # number belongs to the pull request that triggered the run.
+        id: check_files
+        run: |
+          pr_found=false
+          if [ -f pr_number.txt ]; then
+            # Command substitution strips the trailing newline written by echo.
+            pr_number="$(cat pr_number.txt)"
+            case "$pr_number" in
+              '' | *[!0-9]*)
+                # Empty, multi-line, or otherwise non-numeric content is rejected.
+                echo "::warning::Ignoring pr_number.txt because its content is not a plain integer"
+                ;;
+              *)
+                echo "pr_number=${pr_number}" >> "$GITHUB_OUTPUT"
+                pr_found=true
+                ;;
+            esac
+          fi
+          echo "pr_found=${pr_found}" >> "$GITHUB_OUTPUT"
+
+          if [ -f python-code-coverage-results.md ]; then
+            echo "python_found=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "python_found=false" >> "$GITHUB_OUTPUT"
+          fi
+
+          if [ -f cpp-code-coverage-results.md ]; then
+            echo "cpp_found=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "cpp_found=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Post Python Coverage PR Comment
+        if: steps.check_files.outputs.pr_found == 'true' && steps.check_files.outputs.python_found == 'true'
+        uses: marocchino/sticky-pull-request-comment@773744901bac0e8cbb5a0dc842800d45e9b2b405 # ratchet:marocchino/sticky-pull-request-comment@v2
+        with:
+          recreate: true
+          number: ${{ steps.check_files.outputs.pr_number }}
+          path: python-code-coverage-results.md
+
+      - name: Post C++ Coverage PR Comment
+        if: steps.check_files.outputs.pr_found == 'true' && steps.check_files.outputs.cpp_found == 'true'
+        uses: marocchino/sticky-pull-request-comment@773744901bac0e8cbb5a0dc842800d45e9b2b405 # ratchet:marocchino/sticky-pull-request-comment@v2
+        with:
+          header: cpp-coverage
+          recreate: true
+          number: ${{ steps.check_files.outputs.pr_number }}
+          path: cpp-code-coverage-results.md
@@ -36,15 +36,16 @@ When comparing
 
 We observe:
 
-* Data write times that are up to 20-30x faster for ML Flashpoint specifically, with little to no optimization.
-This is expected to further improve with additional optimizations.
-* Total checkpoint recovery times that are ~7-10x faster for ML Flashpoint specifically (includes the time it takes to do checkpoint detection, cross-node coordination, replication, read into model state and be ready to resume training).
+* Data write times that are up to 120x faster for ML Flashpoint specifically, currently reaching up to ~30 GB/s/node write throughput (scales linearly with cluster size).
+* Total checkpoint recovery times that are ~7-12x faster for ML Flashpoint specifically, depending on number of nodes lost (includes the time it takes to do checkpoint detection, cross-node coordination, replication, read into model state and be ready to resume training).
 * For _async_ checkpointing: 
     * Improvements averaging **3%** (Gemma 27B) & **6%** (Llama 70B) for _overall job time_ in the hybrid approach.
     * Improvements reach **5%** (Gemma 27B) & **10%** (Llama 70B) when NeMo checkpointing is deferred to the end (300th step) instead of being done every 50 steps. 
     * These improvements only account for checkpoint _save_ efficiency, representing a "lower bound" value as it doesn't account for the speedups in _recovery_ time.
     * Any job interruptions would also benefit from ML Flashpoint's recovery performance gains.
 
+Stay tuned and watch the [repository](https://github.com/google/ml-flashpoint) for updates on future improvements!
+
 !!! info
 
     While [ML runtime goodput](https://cloud.google.com/blog/products/ai-machine-learning/goodput-metric-as-measure-of-ml-productivity) is important, we focus on overall job time as an end-to-end metric, as it is simpler and allows for straightforward _total_ cost comparisons.
@@ -69,8 +70,7 @@ To use ML Flashpoint, the basic requirements for the training environment are:
     * This is enforced so that the pairwise strategy doesn't put a higher memory burden on one node than the others, and so the general capacity requirements are roughly consistent across nodes.
 1. A `tmpfs` mount is strongly recommended to be used for the container base path, that is separate from `/dev/shm`.
 E.g. a `/tmp` mount, which can be added to `/etc/fstab` on Linux machines to mount it persistently (A3-Mega example):
-    1. `tmpfs         /tmp            tmpfs           rw,nosuid,nodev,size=1024G,mode=1777,noswap,huge=within_size   0 0`
-    1. `huge=within_size` is recommended to use huge pages for any files large enough, since checkpoint data is on the order of many GBs.
+    1. `tmpfs         /tmp            tmpfs           rw,nosuid,nodev,size=1024G,mode=1777,noswap   0 0`
     1. `noswap` is recommended to avoid degrading performance.
    This can be omitted if you prefer to allow transparent disk swapping to accommodate more checkpoint storage than can fit in memory, at the cost of poorer checkpointing performance.
     1. The amount of memory needed is at least equal to the checkpoint size per node x 4, to account for replicas and in-progress checkpoints. 
 
@@ -133,6 +133,7 @@ from ml_flashpoint.replication.replication_manager import ReplicationManager
 
 # Megatron Checkpointing
 from megatron.core import dist_checkpointing as mcore_dist_checkpointing
+from ml_flashpoint.adapter.megatron.save_utils import save_local_aware_megatron_checkpoint
 ```
 
 #### Save Strategy
@@ -150,8 +151,19 @@ megatron_save_strategy = MLFlashpointMegatronAsyncSaveStrategy(
 )
 ```
 
-Because Megatron's `dist_checkpointing.save()` function writes "common" data only on global rank 0, which does not align with local checkpointing, you can orchestrate saves using the save strategy the same way it's done in [`MLFlashpointCheckpointIO.save_checkpoint()`](https://github.com/google/ml-flashpoint/blob/b9767583520106f59743b9e8050769523cfbef6e/src/ml_flashpoint/adapter/nemo/checkpoint_io.py#L137-L171) in the `ml_flashpoint.adapter.nemo` package.
-You'll notice that the logic there aims to mimic `dist_checkpointing.save`, but it saves common data on each node (via local rank 0) as opposed to solely on the coordinator node (global rank 0).
+Because Megatron's `dist_checkpointing.save()` function writes "common" data only on global rank 0, which does not align with local checkpointing, use the provided helper function `save_local_aware_megatron_checkpoint()` from the `ml_flashpoint.adapter.megatron.save_utils` module.
+
+This helper mimics `dist_checkpointing.save()`, but saves common data on each node (via local rank 0) rather than solely on the coordinator node (global rank 0).
+
+```python
+# In your save loop
+async_request = save_local_aware_megatron_checkpoint(
+    checkpoint=state_dict,
+    checkpoint_dir=str(curr_step_checkpoint_id),
+    save_strategy=megatron_save_strategy,
+    async_save=True,
+)
+```
 
 !!! note
 
 
@@ -19,8 +19,22 @@
 # ===================================================================
 [project]
 name = "ml-flashpoint"
-version = "0.0.0"
+dynamic = [ "version" ]
 description = "A memory-first, lightning fast, easy-to-use ML checkpointing library."
+readme = "README.md"
+license = { file = "LICENSE" }
+classifiers = [
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: POSIX :: Linux",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
 
 # Specifies the minimum version of Python required to install and run this package.
 requires-python = ">=3.10"
@@ -101,6 +115,7 @@ requires = [
     "scikit-build-core==0.11.6",
     "cmake==3.31.10",
     "ninja==1.11.1.3",
+    "setuptools-scm==9.2.2",
 ]
 
 # The Python object that `pip` will call to execute the build.
@@ -114,6 +129,9 @@ build-backend = "scikit_build_core.build"
 # ===================================================================
 [tool.scikit-build]
 
+# Tells scikit-build-core to use setuptools-scm to retrieve the version from git.
+metadata.version.provider = "scikit_build_core.metadata.setuptools_scm"
+
 # Specifies the minimum version of CMake that must be present on the system.
 cmake.version = ">=3.18"
 
@@ -134,6 +152,14 @@ cmake.source-dir = "."
 # https://scikit-build-core.readthedocs.io/en/latest/configuration/index.html#customizing-the-built-wheel
 # wheel.packages = ["src/ml_flashpoint"]
 
+# ===================================================================
+# Tool-specific Configuration for setuptools-scm
+# ===================================================================
+[tool.setuptools_scm]
+# Fallback version to use if git is not available or the directory is not a git repo.
+# This prevents build failures in environments like some CI runners or /tmp clones.
+fallback_version = "0.0.0"
+
 # ===================================================================
 # Tool-specific Configuration for Ruff
 # ===================================================================
 
@@ -12,3 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from ml_flashpoint.adapter.megatron.save_utils import (
+    save_local_aware_megatron_checkpoint as save_local_aware_megatron_checkpoint,
+)
@@ -19,6 +19,7 @@
 from pathlib import Path
 from typing import Union
 
+import torch
 from megatron.core.dist_checkpointing.mapping import ShardedStateDict
 from megatron.core.dist_checkpointing.strategies.async_utils import AsyncRequest
 from megatron.core.dist_checkpointing.strategies.base import AsyncSaveShardedStrategy
@@ -32,7 +33,7 @@
 
 from ml_flashpoint.adapter.pytorch import custom_state_dict_saver as statedictsaver
 from ml_flashpoint.adapter.pytorch.memory_storage_writer import MemoryStorageWriter
-from ml_flashpoint.core import utils
+from ml_flashpoint.core import mlf_logging, utils
 from ml_flashpoint.core.checkpoint_id_types import CheckpointContainerId
 from ml_flashpoint.core.checkpoint_saver import MLFlashpointCheckpointSaver, ObjectWriteBucket
 from ml_flashpoint.core.mlf_logging import get_logger
@@ -41,6 +42,26 @@
 _LOGGER = get_logger(__name__)
 
 
+def _save_checkpoint(
+    staged_buckets: list[ObjectWriteBucket],
+    checkpoint_id: CheckpointContainerId,
+    storage_writer: MemoryStorageWriter,
+    rank: int,
+    step: int,
+):
+    """
+    This function is the 'async_fn' run in Megatron's :class:`AsyncRequest`.
+    """
+
+    mlf_logging.setup_worker_logging(rank, step)
+    statedictsaver.write_data(
+        checkpoint_id=checkpoint_id,
+        storage_writer=storage_writer,
+        staged_write_buckets=staged_buckets,
+        replicate_after_write=False,
+    )
+
+
 def default_backend_format_name() -> str:
     return "ml_flashpoint"
 
@@ -105,7 +126,7 @@ def async_save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Union
         # 1b. Re-initialize the StorageWriter to use a new instance per save to avoid hangs from shared state.
         self._storage_writer = MemoryStorageWriter(
             checkpoint_saver=self._checkpoint_saver,
-            mp_manager=self._storage_writer._mp_manager,
+            mp_manager=self._storage_writer._main_process_torchmp_manager,
             thread_count=self._storage_writer._thread_count,
         )
         # 1c. Reset the StorageWriter for this checkpoint version.
@@ -156,17 +177,6 @@ def async_save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Union
         with open(os.path.join(checkpoint_dir, "metadata.json"), "w") as f:
             json.dump(metadata, f)
 
-        def _save_checkpoint(staged_buckets: list[ObjectWriteBucket]):
-            """
-            This function is the 'async_fn' run in Megatron's :class:`AsyncRequest`.
-            """
-            statedictsaver.write_data(
-                checkpoint_id=checkpoint_id,
-                storage_writer=self._storage_writer,
-                staged_write_buckets=staged_buckets,
-                replicate_after_write=False,
-            )
-
         finalize_fns = [
             # Replicate written objects
             partial(
@@ -188,9 +198,18 @@ def _save_checkpoint(staged_buckets: list[ObjectWriteBucket]):
             ),
         ]
 
+        current_rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else -1
+        current_step = mlf_logging.get_current_step()
+
         return AsyncRequest(
             async_fn=_save_checkpoint,
             async_fn_args=(),
-            async_fn_kwargs={"staged_buckets": staged_write_buckets},
+            async_fn_kwargs={
+                "staged_buckets": staged_write_buckets,
+                "checkpoint_id": checkpoint_id,
+                "storage_writer": self._storage_writer,
+                "rank": current_rank,
+                "step": current_step,
+            },
             finalize_fns=finalize_fns,
         )