
Commit ee0dec9

test(torchtitan): add model unit tests for TorchTitan backend (#256)

Parent: b237c12

12 files changed: +331, -43 lines

examples/torchtitan/configs/MI300X/llama3.1_405B-pretrain.yaml (3 additions, 0 deletions)

```diff
@@ -18,6 +18,9 @@ modules:
 training:
   local_batch_size: 2
 
+metrics:
+  log_freq: 1
+
 optimizer:
   lr: 8.0e-5
```
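
This and the seven MI300X config diffs that follow make the same kind of change: each example config pins `metrics.log_freq` to 1, either lowering it from 10 or adding the key where it was absent, so training metrics are logged every step. As a rough sketch of the resulting block (nesting is illustrative; the exact placement follows each file's hunk context):

```yaml
# Illustrative only: the metrics block as it lands in each config.
# In the qwen3 configs it sits under the trainer's overrides section.
metrics:
  log_freq: 1   # log every training step (was 10 where the key existed)
```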

examples/torchtitan/configs/MI300X/llama3.1_70B-BF16-pretrain.yaml (1 addition, 1 deletion)

```diff
@@ -20,7 +20,7 @@ modules:
   warmup_steps: 10
 
 metrics:
-  log_freq: 10
+  log_freq: 1
 
 training:
   local_batch_size: 4
```

examples/torchtitan/configs/MI300X/llama3.1_70B-FP8-pretrain.yaml (1 addition, 1 deletion)

```diff
@@ -20,7 +20,7 @@ modules:
   warmup_steps: 10
 
 metrics:
-  log_freq: 10
+  log_freq: 1
 
 training:
   local_batch_size: 3
```

examples/torchtitan/configs/MI300X/llama3.1_8B-BF16-pretrain.yaml (1 addition, 1 deletion)

```diff
@@ -16,7 +16,7 @@ modules:
   stderr_sink_level: INFO
 
 metrics:
-  log_freq: 10
+  log_freq: 1
   enable_wandb: false
 
 lr_scheduler:
```

examples/torchtitan/configs/MI300X/llama3.1_8B-FP8-pretrain.yaml (1 addition, 1 deletion)

```diff
@@ -20,7 +20,7 @@ modules:
   warmup_steps: 10
 
 metrics:
-  log_freq: 10
+  log_freq: 1
 
 training:
   local_batch_size: 4
```

examples/torchtitan/configs/MI300X/qwen3_0.6B-pretrain.yaml (3 additions, 0 deletions)

```diff
@@ -20,6 +20,9 @@ modules:
 lr_scheduler:
   warmup_steps: 2 # lr scheduler warm up, 20% total steps
 
+metrics:
+  log_freq: 1
+
 training:
   local_batch_size: 4
   seq_len: 4096
```

examples/torchtitan/configs/MI300X/qwen3_1.7B-pretrain.yaml (3 additions, 0 deletions)

```diff
@@ -12,6 +12,9 @@ modules:
 model: qwen3_1.7b.yaml
 overrides:
 
+  metrics:
+    log_freq: 1
+
   optimizer:
     name: "AdamW"
     lr: 8.0e-4
```

examples/torchtitan/configs/MI300X/qwen3_32B-pretrain.yaml (3 additions, 0 deletions)

```diff
@@ -12,6 +12,9 @@ modules:
 model: qwen3_32b.yaml
 overrides:
 
+  metrics:
+    log_freq: 1
+
   optimizer:
     name: "AdamW"
     lr: 3.0e-4
```

examples/torchtitan/prepare.py (89 additions, 32 deletions)

```diff
@@ -12,9 +12,6 @@
 from pathlib import Path
 from typing import Optional
 
-from huggingface_hub import snapshot_download
-from requests.exceptions import HTTPError
-
 from examples.scripts.utils import (
     get_env_case_insensitive,
     get_node_rank,
@@ -25,22 +22,6 @@
 from primus.core.launcher.parser import PrimusParser
 
 
-def hf_download(repo_id: str, local_dir: str, hf_token: Optional[str] = None) -> None:
-    try:
-        snapshot_download(
-            repo_id=repo_id,
-            local_dir=local_dir,
-            local_dir_use_symlinks=False,
-            token=hf_token,
-            ignore_patterns=["*.bin", "*.pt", "*.safetensors"],
-        )
-    except HTTPError as e:
-        if e.response.status_code == 401:
-            log_error_and_exit("You need to pass a valid `HF_TOKEN` to download private checkpoints.")
-        else:
-            raise e
-
-
 def parse_args():
     parser = argparse.ArgumentParser(description="Prepare Primus environment")
     parser.add_argument("--primus_path", type=str, required=True, help="Root path to the Primus project")
@@ -85,6 +66,75 @@ def resolve_backend_path(
     return path
 
 
+def run_titan_hf_download(
+    torchtitan_path: Path, repo_id: str, local_dir: Path, hf_token: Optional[str] = None
+):
+    """Use Titan's own download_hf_assets.py to fetch tokenizer/model assets."""
+    script_path = torchtitan_path / "scripts" / "download_hf_assets.py"
+    if not script_path.is_file():
+        log_error_and_exit(f"TorchTitan script not found: {script_path}")
+
+    cmd = [
+        "python",
+        str(script_path),
+        "--repo_id",
+        repo_id,
+        "--assets",
+        "tokenizer",
+        "--local_dir",
+        str(local_dir),
+    ]
+    env = os.environ.copy()
+    if hf_token:
+        env["HF_TOKEN"] = hf_token
+
+    log_info(f"[rank0] Running Titan HF downloader:\n {' '.join(cmd)}")
+    ret = subprocess.run(cmd, env=env, cwd=torchtitan_path)
+    if ret.returncode != 0:
+        log_error_and_exit(f"TorchTitan HF download failed with code {ret.returncode}")
+
+
+def resolve_hf_assets_path(data_path: Path, hf_assets_value: str) -> tuple[str, Path, bool]:
+    """
+    Resolve HuggingFace asset source — supports both repo IDs and local paths.
+
+    Args:
+        data_path (Path):
+            Base data directory (e.g., /data/primus_data).
+        hf_assets_value (str):
+            Can be either:
+              - A HuggingFace repo ID (e.g., "meta-llama/Llama-3.1-70B")
+              - A local directory path (e.g., "/data/primus_data/torchtitan/Llama-3.1-70B")
+
+    Returns:
+        (repo_or_path, local_dir, need_download)
+            repo_or_path: str — repo_id if remote; same path if local
+            local_dir: Path — where assets are or will be located
+            need_download: bool — True if download is required
+
+    Behavior:
+        1. If hf_assets_value is an existing directory path:
+           → Treat it as an already downloaded local path.
+        2. If it is not an existing directory (likely a repo_id):
+           → Derive the local target dir as
+             data_path / "torchtitan" / <last_component_of_repo_id>
+           → Mark need_download=True.
+    """
+    path_candidate = Path(hf_assets_value).expanduser()
+
+    # Case 1: already-downloaded local directory
+    if path_candidate.exists() and path_candidate.is_dir():
+        log_info(f"Detected local HF assets path: {path_candidate}")
+        return hf_assets_value, path_candidate.resolve(), False
+
+    # Case 2: repo_id (e.g., meta-llama/Llama-3.1-70B) → need to download
+    repo_id = hf_assets_value
+    repo_name = Path(repo_id).name  # last segment, e.g., Llama-3.1-70B
+    local_dir = data_path / "torchtitan" / repo_name
+    log_info(f"Resolved HF repo_id={repo_id}, local_dir={local_dir}")
+    return repo_id, local_dir, True
+
+
 def main():
     args = parse_args()
 
@@ -120,28 +170,35 @@ def main():
     if not hasattr(pre_trainer_cfg.model, "hf_assets_path") or not pre_trainer_cfg.model.hf_assets_path:
         log_error_and_exit("Missing required field: pre_trainer.model.tokenizer_path")
 
-    hf_assets_path = pre_trainer_cfg.model.hf_assets_path
-
-    full_path = data_path / "torchtitan" / hf_assets_path.lstrip("/")
+    hf_assets_value = pre_trainer_cfg.model.hf_assets_path
+    repo_id, local_dir, need_download = resolve_hf_assets_path(data_path, hf_assets_value)
+    tokenizer_file = local_dir / "tokenizer.json"
 
-    tokenizer_test_file = full_path / "tokenizer.json"
-    if not tokenizer_test_file.is_file():
+    if need_download:
+        # Remote repo_id case — download via Titan script
         hf_token = os.environ.get("HF_TOKEN")
         if not hf_token:
-            log_error_and_exit("HF_TOKEN not set. Please export HF_TOKEN.")
+            log_error_and_exit("HF_TOKEN not set. Please export HF_TOKEN before running prepare.")
 
         if get_node_rank() == 0:
-            log_info(f"Downloading HF assets for tokenizer to {full_path} ...")
-            full_path.mkdir(parents=True, exist_ok=True)
-            hf_download(repo_id=hf_assets_path, local_dir=str(full_path), hf_token=hf_token)
+            if not tokenizer_file.exists():
+                log_info(f"Downloading HF assets from repo={repo_id} into {local_dir} ...")
+                parent_dir = local_dir.parent
+                parent_dir.mkdir(parents=True, exist_ok=True)
+                run_titan_hf_download(torchtitan_path, repo_id, parent_dir, hf_token)
+            else:
+                log_info(f"Tokenizer assets already exist: {tokenizer_file}")
         else:
-            log_info(f"Rank {get_node_rank()} waiting for tokenizer download ...")
-            while not tokenizer_test_file.exists():
+            # Other ranks wait until the file is available
+            log_info(f"[rank{get_node_rank()}] waiting for tokenizer download ...")
+            while not tokenizer_file.exists():
                 time.sleep(5)
     else:
-        log_info(f"Tokenizer assets already exist: {tokenizer_test_file}")
+        # Local path case — skip download
+        log_info(f"HF assets already available locally at {local_dir}")
 
-    write_patch_args(patch_args_file, "train_args", {"model.hf_assets_path": str(full_path)})
+    # Pass resolved path to training phase
+    write_patch_args(patch_args_file, "train_args", {"model.hf_assets_path": str(local_dir)})
     write_patch_args(patch_args_file, "train_args", {"backend_path": str(torchtitan_path)})
     write_patch_args(patch_args_file, "torchrun_args", {"local-ranks-filter": "1"})
```
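
To make the new resolution flow concrete, here is a minimal, self-contained sketch of the two branches of `resolve_hf_assets_path`, condensed from the diff above (logging omitted; the base directory and repo ID are illustrative):

```python
from pathlib import Path

def resolve(data_path: Path, value: str) -> tuple[str, Path, bool]:
    """Condensed restatement of resolve_hf_assets_path's branching."""
    candidate = Path(value).expanduser()
    if candidate.exists() and candidate.is_dir():
        # Case 1: already-downloaded local directory -> nothing to fetch
        return value, candidate.resolve(), False
    # Case 2: treat the value as a HF repo_id and derive the target dir
    return value, data_path / "torchtitan" / Path(value).name, True

data = Path("/data/primus_data")  # hypothetical base dir from the docstring
repo, local_dir, need_download = resolve(data, "meta-llama/Llama-3.1-70B")
assert repo == "meta-llama/Llama-3.1-70B"
assert need_download
assert local_dir == data / "torchtitan" / "Llama-3.1-70B"
```

Note that `main()` hands `local_dir.parent` (not `local_dir`) to `run_titan_hf_download`, so TorchTitan's `download_hf_assets.py` is evidently expected to create the repo-named subdirectory itself; the subsequent `tokenizer_file.exists()` check relies on that layout.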

tests/run_unit_tests.py (9 additions, 0 deletions)

```diff
@@ -20,11 +20,20 @@ def get_all_unit_tests():
     cur_dir = "./tests"
     unit_tests = {}
 
+    EXCLUDE_UNIT_TESTS = [
+        "unit_tests/megatron/cco/test_tp_overlap.py",
+    ]
+
     for root, dirs, files in os.walk(cur_dir):
         for file_name in files:
             if not file_name.endswith(".py") or not file_name.startswith("test_"):
                 continue
 
+            # Construct relative path from tests/
+            rel_path = os.path.relpath(os.path.join(root, file_name), start=cur_dir)
+            if rel_path in EXCLUDE_UNIT_TESTS:
+                continue
+
             if file_name not in DISTRIBUTED_UNIT_TESTS:
                 unit_tests[os.path.join(root, file_name)] = 1
             else:
```