
Commit c1cb835

Write Tokenized Data Sizes as metadata (#2431)
Writes out the token size info alongside the tokenized data itself (request from https://discord.com/channels/1354881461060243556/1366632114316906506/1458962443542724785). This doesn't help for already-tokenized data, but it means that, going forward, reasonable stats will live alongside the data itself and can be easily accessed to compute things like epochs.
1 parent 96fae76 commit c1cb835
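
The commit message mentions using these stats to compute epochs. A minimal consumer sketch, assuming a cache written after this change (the cache path and token budget below are made-up example values; the "total_tokens" key comes from the aggregate .stats.json written in tokenize.py below):

import json

import fsspec

cache_path = "gs://my-bucket/tokenized/my-dataset"  # hypothetical location
token_budget = 1_000_000_000  # hypothetical training budget, in tokens

# Read the aggregate stats file written next to the tokenized cache.
with fsspec.open(f"{cache_path}/.stats.json") as f:
    stats = json.load(f)

epochs = token_budget / stats["total_tokens"]
print(f"{stats['total_tokens']:,} tokens in cache -> {epochs:.2f} epochs at this budget")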

2 files changed

Lines changed: 30 additions & 2 deletions


lib/marin/src/marin/processing/tokenize/tokenize.py

Lines changed: 20 additions & 0 deletions
@@ -21,12 +21,15 @@
 
 import abc
 import dataclasses
+import json
 import logging
 import os
 import re
 from collections.abc import Iterator, Sequence
 from typing import Any
 
+import fsspec
+
 import draccus
 from fray.job.context import JobContext
 import humanfriendly
@@ -367,6 +370,23 @@ def run_pipeline(paths: list[str], split_name: str) -> None:
             shard_cache_paths=shard_paths, output_path=prefix, exemplar=exemplar, context=cluster_ctx
         )
 
+        # Aggregate token counts from shard stats
+        total_tokens = 0
+        total_elements = 0
+        for shard_path in shard_paths:
+            stats_path = f"{shard_path}/.stats.json"
+            with fsspec.open(stats_path) as f:
+                stats = json.load(f)
+            total_tokens += stats.get("token_count", 0)
+            total_elements += stats.get("num_rows", 0)
+
+        stats_path = os.path.join(prefix, ".stats.json")
+        logger.info(
+            f"Writing total token count ({total_tokens:,}) and element count ({total_elements:,}) to {stats_path}"
+        )
+        with fsspec.open(stats_path, "w") as f:
+            json.dump({"total_tokens": total_tokens, "total_elements": total_elements}, f)
+
     if train_paths:
         run_pipeline(train_paths, "train")
 

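Taken together with the shard-level writer in zephyr below, a finished run leaves stats at both levels. A rough sketch of the resulting layout, assuming the shard caches sit under the output prefix (directory names and numbers are illustrative):

<prefix>/
    .stats.json          {"total_tokens": 6172839, "total_elements": 0}
    shard-00000/
        .success
        .stats.json      {"count": 4938, "token_count": 3086420}
    shard-00001/
        .success
        .stats.json      {"count": 4938, "token_count": 3086419}

One wrinkle for consumers: the aggregation above reads stats.get("num_rows", 0) for the element count, while the shard writer records it under "count", so total_elements falls back to 0 for shards written as in this commit.
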
lib/zephyr/src/zephyr/writers.py

Lines changed: 10 additions & 2 deletions
@@ -18,6 +18,7 @@
 
 from dataclasses import asdict, is_dataclass
 import itertools
+import json
 import os
 from collections.abc import Iterable
 from contextlib import contextmanager
@@ -252,21 +253,28 @@ def write_levanter_cache(records: Iterable[dict[str, Any]], output_path: str, me
     try:
         exemplar = next(iter(records))
     except StopIteration:
-        return {"path": output_path, "count": 0}
+        return {"path": output_path, "count": 0, "token_count": 0}
 
     count = 1
+    token_count = len(exemplar.get("input_ids", []))
     with atomic_rename(output_path) as tmp_path:
         with SerialCacheWriter(tmp_path, exemplar, shard_name=output_path, metadata=CacheMetadata(metadata)) as writer:
             writer.write_batch([exemplar])
             for batch in batchify(records):
                 writer.write_batch(batch)
                 count += len(batch)
+                for record in batch:
+                    token_count += len(record.get("input_ids", []))
 
     # write success sentinel
     with fsspec.open(f"{output_path}/.success", "w") as f:
         f.write("")
 
-    return {"path": output_path, "count": count}
+    # write stats for aggregation
+    with fsspec.open(f"{output_path}/.stats.json", "w") as f:
+        json.dump({"count": count, "token_count": token_count}, f)
+
+    return {"path": output_path, "count": count, "token_count": token_count}
 
 
 def write_binary_file(records: Iterable[bytes], output_path: str) -> dict:
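
For intuition, the token tally above reduces to summing the lengths of each record's "input_ids", with records that lack the key contributing zero via the .get default. A tiny self-contained example:

# Each record's token contribution is the length of its "input_ids" list.
records = [
    {"input_ids": [101, 2023, 2003, 102]},  # 4 tokens
    {"input_ids": [101, 102]},              # 2 tokens
    {"text": "not yet tokenized"},          # 0 tokens: no "input_ids" key
]
token_count = sum(len(r.get("input_ids", [])) for r in records)
assert token_count == 6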
