Implemented changes requested by Fabrice

qchapp · qchapp · commit 7946278ee10d · 2026-05-08T17:17:44.000+02:00
diff --git a/README.md b/README.md
@@ -315,7 +315,36 @@ Key metrics:
 
 Reference benchmark:
 - [DataTrove Benchmark](https://github.com/huggingface/datatrove/tree/main/examples/inference/benchmark)
-- `mmirage run --config configs/config_benchmark_datatrove.yaml --stats`
+
+The config `configs/config_benchmark_datatrove.yaml` mirrors the DataTrove inference benchmark conditions:
+
+| Setting | Value |
+|---|---|
+| Dataset | `simplescaling/s1K-1.1` (train split, 1 000 samples) |
+| Prompt | raw `question` field, no system prompt |
+| Output | up to 1 024 tokens per sample |
+| Context | 2 048-token model max context |
+| Model | `Qwen/Qwen3-4B` (DataTrove baseline: tp=1 on a single GPU) |
+
+Download the dataset before running:
+
+```python
+from datasets import load_dataset
+ds = load_dataset('simplescaling/s1K-1.1', split='train')
+ds.save_to_disk('data/s1K-1.1')
+```
+
+Then run with stats collection enabled:
+
+```bash
+mmirage run --config configs/config_benchmark_datatrove.yaml --stats
+```
+
+Inspect results:
+
+```bash
+mmirage stats --config configs/config_benchmark_datatrove.yaml
+```
 
 ## Architecture
 
diff --git a/configs/config_benchmark_datatrove.yaml b/configs/config_benchmark_datatrove.yaml
@@ -1,29 +1,5 @@
 # MMIRAGE — DataTrove-compatible throughput benchmark
-#
-# Mirrors the conditions used in the DataTrove inference benchmark
-# (https://github.com/huggingface/datatrove/tree/main/examples/inference/benchmark):
-#
-#   dataset : simplescaling/s1K-1.1  (train split, 1 000 samples)
-#   prompt  : raw `question` field, no system prompt
-#   output  : up to 1 024 tokens per sample
-#   context : 2 048-token model max context
-#   model   : Qwen/Qwen3-4B  (DataTrove baseline: tp=1 on a single GPU)
-#
-# Download the dataset before running:
-#
-#   python -c "
-#   from datasets import load_dataset
-#   ds = load_dataset('simplescaling/s1K-1.1', split='train')
-#   ds.save_to_disk('data/s1K-1.1')
-#   "
-#
-# Then run with stats collection enabled:
-#
-#   mmirage run --config configs/config_benchmark_datatrove.yaml --stats
-#
-# Inspect results:
-#
-#   mmirage stats --config configs/config_benchmark_datatrove.yaml
+# See README.md for setup instructions and benchmark details.
 
 processors:
   - type: llm
diff --git a/pyproject.toml b/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
   "jmespath",
   "jinja2>=3.0.0",
   "pillow>=9.0.0",
+  "humanize>=4.0.0",
 ]
 
 [project.optional-dependencies]
diff --git a/src/mmirage/core/process/mapper.py b/src/mmirage/core/process/mapper.py
@@ -1,10 +1,19 @@
 """Mapper for orchestrating variable transformations."""
 
+from dataclasses import dataclass
 from typing import Dict, Any, List, cast
 
 from mmirage.core.process.variables import BaseVar, InputVar, OutputVar
 from mmirage.core.process.base import AutoProcessor, BaseProcessor, BaseProcessorConfig
 
+
+@dataclass
+class TokenCounts:
+    """Cumulative token counts from LLM processors."""
+
+    input_tokens: int
+    output_tokens: int
+
 import logging
 
 from mmirage.core.process.variables import VariableEnvironment
@@ -104,14 +113,14 @@ def rewrite_batch(
 
         return batch_environment
 
-    def get_token_counts(self) -> Dict[str, int]:
+    def get_token_counts(self) -> TokenCounts:
         """Return cumulative token counts aggregated across all LLM processors.
 
         Sums ``input_tokens`` and ``output_tokens`` from every processor that
         exposes a ``get_token_counts()`` method (i.e., ``LLMProcessor``).
 
         Returns:
-            Dict with ``input_tokens`` and ``output_tokens`` keys.
+            TokenCounts with ``input_tokens`` and ``output_tokens`` fields.
         """
         total_input = 0
         total_output = 0
@@ -120,7 +129,7 @@ def get_token_counts(self) -> Dict[str, int]:
                 counts = proc.get_token_counts()
                 total_input += counts.get("input_tokens", 0)
                 total_output += counts.get("output_tokens", 0)
-        return {"input_tokens": total_input, "output_tokens": total_output}
+        return TokenCounts(input_tokens=total_input, output_tokens=total_output)
 
     def get_load_time(self) -> float:
         """Return total model-loading time (seconds) summed across all LLM processors."""
diff --git a/src/mmirage/shard_process.py b/src/mmirage/shard_process.py
@@ -109,7 +109,7 @@ def main():
             all_visible = [x.strip() for x in cuda_visible.split(",") if x.strip()]
             # Fall back to range-based indices if CUDA_VISIBLE_DEVICES was set
             # but contained only whitespace/empty entries after stripping.
-            gpu_indices_for_polling: Optional[List[str]] = all_visible[:tp_size] if all_visible else [str(i) for i in range(tp_size)]
+            gpu_indices_for_polling: List[str] = all_visible[:tp_size] if all_visible else [str(i) for i in range(tp_size)]
         else:
             gpu_indices_for_polling = [str(i) for i in range(tp_size)]
         gpu_poller: GpuUtilizationPoller = GpuUtilizationPoller(
@@ -184,8 +184,8 @@ def main():
 
         # Collect token counts accumulated by LLM processor(s).
         token_counts = mapper.get_token_counts()
-        input_tokens = token_counts["input_tokens"] or None
-        output_tokens = token_counts["output_tokens"] or None
+        input_tokens = token_counts.input_tokens or None
+        output_tokens = token_counts.output_tokens or None
         model_load_seconds = mapper.get_load_time() or None
 
         # Resolve num_gpus from the first processor config that exposes tp_size.
diff --git a/src/mmirage/shard_utils.py b/src/mmirage/shard_utils.py
@@ -6,6 +6,7 @@
 
 from datetime import datetime
 from dataclasses import dataclass
+import humanize
 import json
 import logging
 import os
@@ -24,24 +25,10 @@
 
 
 def format_duration(seconds: Optional[float]) -> Optional[str]:
-    """Format a duration given in seconds as a human-readable string.
-
-    Examples::
-
-        format_duration(45.3)     -> "45s"
-        format_duration(125.0)    -> "2m 5s"
-        format_duration(3725.0)   -> "1h 2m 5s"
-    """
+    """Format a duration given in seconds as a human-readable string."""
     if seconds is None:
         return None
-    total = int(seconds)
-    hours, remainder = divmod(total, 3600)
-    minutes, secs = divmod(remainder, 60)
-    if hours:
-        return f"{hours}h {minutes}m {secs}s"
-    if minutes:
-        return f"{minutes}m {secs}s"
-    return f"{secs}s"
+    return humanize.precisedelta(seconds)
 
 
 @dataclass

| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | | `@@ -38,6 +38,7 @@ dependencies = [` |
| `38` | `38` | `"jmespath",` |
| `39` | `39` | `"jinja2>=3.0.0",` |
| `40` | `40` | `"pillow>=9.0.0",` |
| | `41` | `+ "humanize>=4.0.0",` |
| `41` | `42` | `]` |
| `42` | `43` | |
| `43` | `44` | `[project.optional-dependencies]` |