AMD-AGI
diff --git a/‎primus/configs/modules/torchtitan/pre_trainer.yaml‎
Lines changed: 1 addition & 0 deletions b/‎primus/configs/modules/torchtitan/pre_trainer.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎primus/modules/trainer/torchtitan/patch_utils.py‎
Lines changed: 68 additions & 0 deletions b/‎primus/modules/trainer/torchtitan/patch_utils.py‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎primus/modules/trainer/torchtitan/pre_trainer.py‎
Lines changed: 22 additions & 1 deletion b/‎primus/modules/trainer/torchtitan/pre_trainer.py‎
Lines changed: 22 additions & 1 deletion
diff --git a/‎tests/modules/__init__.py‎ b/‎tests/modules/__init__.py‎
diff --git a/‎tests/modules/trainer/__init__.py‎ b/‎tests/modules/trainer/__init__.py‎
diff --git a/‎tests/modules/trainer/torchtitan/__init__.py‎ b/‎tests/modules/trainer/torchtitan/__init__.py‎
diff --git a/‎tests/modules/trainer/torchtitan/test_patch_utils.py‎
Lines changed: 42 additions & 0 deletions b/‎tests/modules/trainer/torchtitan/test_patch_utils.py‎
Lines changed: 42 additions & 0 deletions
@@ -33,6 +33,7 @@ profiling:
   save_traces_folder: profile_traces
 
 training:
+  mock_data: true
   dataset: c4
   dataset_path: null
   deterministic: false
 
@@ -0,0 +1,68 @@
+###############################################################################
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+
+import numpy as np
+from datasets import Dataset
+
+
+def _create_mock_text_dataset(num_samples: int = 128) -> Dataset:
+    """
+    Build a small in-memory text dataset used as mocked validation data.
+
+    Args:
+        num_samples: Number of rows to generate.
+
+    Returns:
+        A ``Dataset`` with a single ``"text"`` column whose i-th row is the
+        string ``"validation sample i"``.
+    """
+    columns = {"text": [f"validation sample {idx}" for idx in range(num_samples)]}
+    return Dataset.from_dict(columns)
+
+
+def _create_mock_token_dataset(
+    seq_len: int = 2048,
+    vocab_size: int = 32000,
+    num_samples: int = 256,
+) -> Dataset:
+    """
+    Create a fake text dataset whose rows look like sequences of token ids.
+
+    Each "text" field is a string of exactly `seq_len` space-separated random
+    integers drawn from `[0, vocab_size)`. The generator is seeded with a fixed
+    value, so repeated calls with the same arguments produce identical data.
+
+    The whole dataset is built in memory (it is not a streaming dataset), so
+    keep `seq_len * num_samples` modest.
+
+    NOTE(review): the integers are emitted as plain text. How many real tokens
+    a row yields depends on the tokenizer applied downstream and is not
+    guaranteed to equal `seq_len` - confirm against the consumer if an exact
+    length matters.
+
+    Args:
+        seq_len: Number of random integers written into each row.
+        vocab_size: Exclusive upper bound for the random integers.
+        num_samples: Number of rows to generate.
+
+    Returns:
+        A ``Dataset`` with a single ``"text"`` column and `num_samples` rows.
+    """
+    rng = np.random.default_rng(42)
+
+    # Rows are drawn one at a time from the seeded generator so the produced
+    # values stay reproducible from run to run.
+    samples = [
+        {"text": " ".join(map(str, rng.integers(0, vocab_size, size=seq_len, dtype=np.int32)))}
+        for _ in range(num_samples)
+    ]
+    return Dataset.from_list(samples)
+
+
+def patch_mock_hf_dataset() -> None:
+    """
+    Replace ``datasets.load_dataset`` with a mock that returns synthetic data.
+
+    The patch rebinds the ``load_dataset`` attribute on the ``datasets`` module.
+    Code that looks the function up through the module after this call gets the
+    mock; names already bound with ``from datasets import load_dataset`` before
+    the patch keep pointing at the real loader.
+
+    Any failure while patching is logged and swallowed (best effort), so the
+    caller is never interrupted by this helper.
+    """
+    from primus.core.utils import logger
+
+    try:
+        import datasets
+
+        logger.warning("[Primus Mock] Enabling mock HuggingFace dataset mode.")
+
+        def mock_load_dataset(path: str, *args, **kwargs) -> Dataset:
+            """
+            Replacement for datasets.load_dataset().
+            Intercepts Titan calls like load_dataset('allenai/c4', ...).
+            Returns a fake Dataset of text samples.
+
+            A request counts as "validation" when either the dataset path or
+            the ``split`` keyword argument contains the word "validation".
+            """
+            logger.warning(f"[Primus Mock] load_dataset('{path}') is mocked.")
+            # The split is normally selected through the `split` keyword rather
+            # than the dataset path, so both have to be inspected. `split` may
+            # be a non-string object, hence the str() conversion.
+            split = kwargs.get("split")
+            wants_validation = "validation" in path.lower() or (
+                split is not None and "validation" in str(split).lower()
+            )
+            # Shorter dataset for validation split
+            if wants_validation:
+                return _create_mock_text_dataset(num_samples=32)
+            return _create_mock_token_dataset(seq_len=8192, vocab_size=32000, num_samples=256)
+
+        datasets.load_dataset = mock_load_dataset
+        logger.warning("[PrimusPath][Dataset] Patched datasets.load_dataset successfully.")
+
+    except Exception as e:
+        # Deliberately best effort: report the failure but do not abort startup.
+        logger.error(f"[PrimusPath][Dataset] Failed to patch datasets.load_dataset: {e}")
@@ -26,6 +26,14 @@ def __init__(self, *args, **kwargs):
         pre_trainer_cfg = self.primus_cfg.get_module_config("pre_trainer")
         cfg_dict = nested_namespace_to_dict(pre_trainer_cfg)
 
+        patch_mock = getattr(pre_trainer_cfg.training, "mock_data", False)
+        if patch_mock:
+            from primus.modules.trainer.torchtitan.patch_utils import (
+                patch_mock_hf_dataset,
+            )
+
+            patch_mock_hf_dataset()
+
         self.patch_torchtitan_embedding_amp(cfg_dict["primus_turbo"]["enable_embedding_autocast"])
         self.patch_titan_train_spec(pre_trainer_cfg.model.name, pre_trainer_cfg.model.flavor, extra_args)
 
@@ -460,15 +468,28 @@ def _dict_to_dataclass(self, cls, data: dict[str, Any]) -> Any:
         if not is_dataclass(cls):
             return data
 
+        # collect valid field names
+        field_names = {f.name for f in fields(cls)}
         init_values = {}
+
+        # only use known fields for constructor
         for f in fields(cls):
             if f.name in data:
                 val = data[f.name]
                 if is_dataclass(f.type) and isinstance(val, dict):
                     init_values[f.name] = self._dict_to_dataclass(f.type, val)
                 else:
                     init_values[f.name] = val
-        return cls(**init_values)
+
+        # instantiate dataclass
+        obj = cls(**init_values)
+
+        # attach unknown fields dynamically
+        for k, v in data.items():
+            if k not in field_names:
+                setattr(obj, k, v)
+
+        return obj
 
     def patch_torchtitan_embedding_amp(self, enable_patch: bool):
         """
 
@@ -0,0 +1,42 @@
+###############################################################################
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+
+
+from primus.modules.trainer.torchtitan.patch_utils import patch_mock_hf_dataset
+from tests.utils import PrimusUT
+
+
+class TestTorchtitanPatch(PrimusUT):
+    """Unit tests for the TorchTitan monkey-patch helpers."""
+
+    def setUp(self):
+        # patch_mock_hf_dataset() rebinds a module-level attribute; remember the
+        # real loader so the patch can be undone after each test.
+        import datasets
+
+        self._original_load_dataset = datasets.load_dataset
+
+    def tearDown(self):
+        # Restore the real loader so the mock does not leak into other tests
+        # running in the same process.
+        import datasets
+
+        datasets.load_dataset = self._original_load_dataset
+
+    def test_mock_hf_dataset_patch(self):
+        """
+        Test that patch_mock_hf_dataset() successfully patches datasets.load_dataset
+        and returns a fake HuggingFace Dataset.
+        """
+        patch_mock_hf_dataset()
+
+        # Look the function up through the module so the patched attribute is used
+        import datasets
+
+        ds = datasets.load_dataset("allenai/c4", split="train")
+
+        # Verify that this is an in-memory Dataset with expected content
+        assert isinstance(ds, datasets.Dataset)
+        assert "text" in ds.column_names
+        assert len(ds) > 0
+        sample = ds[0]
+        assert isinstance(sample["text"], str)
+        assert len(sample["text"].split()) > 0