Commit 6b640d9

ravwojdyla-agent authored
tokenize: fix Artifact.load -> Artifact.from_path rename misses (#5756)
## Summary

- PR #5727 renamed `Artifact.load` → `Artifact.from_path` in `marin.execution.artifact` but left three call sites in `lib/marin/src/marin/processing/tokenize/` calling the now-missing method.
- Both `tokenize_attributes_step` and `build_levanter_store_step` blow up at runtime with `AttributeError: type object 'Artifact' has no attribute 'load'` the first time their `_fn` closure runs (i.e. inside the StepRunner, not at module import).
- Rename the 3 call sites + 2 stale docstring references to match.

## Test plan

- [x] `uv run pytest tests/processing/tokenize/test_split_tokenize.py` — 18 passed (existing tests, no new ones).
- [x] `uv run pyrefly check` — clean.
- [x] `./infra/pre-commit.py --files ...` — clean.
- [x] Surfaced while running `experiments/tokenize/all_sources_tokenize.py` on Iris — every step hit the AttributeError immediately.

Note: existing unit tests don't catch this because they exercise `tokenize_attributes` / `build_levanter_store` directly, not the `*_step` factory closures.

Co-authored-by: Rafal Wojdyla <ravwojdyla@gmail.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
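The failure mode in the summary can be reproduced in miniature. The sketch below uses hypothetical stand-in names (`Artifact`, `make_step`), not marin's actual classes: because the stale reference lives inside a closure, importing the module and even calling the factory succeed, and the `AttributeError` only fires when the closure runs.

```python
# Hypothetical minimal reproduction (not marin's actual API): a factory
# builds a closure that references a method removed in a rename, so the
# AttributeError is deferred until the closure is executed.

class Artifact:
    """Stand-in for an Artifact class after a load -> from_path rename."""

    @classmethod
    def from_path(cls, path: str) -> "Artifact":
        # The renamed method; the old Artifact.load no longer exists.
        return cls()


def make_step():
    # Importing this module and even calling the factory both succeed,
    # because the stale reference is only evaluated inside _fn.
    def _fn(output_path: str):
        return Artifact.load(output_path)  # stale call site

    return _fn


step = make_step()  # no error here
try:
    step("out/")  # the AttributeError only surfaces now
except AttributeError as err:
    print(err)  # type object 'Artifact' has no attribute 'load'
```

This is why the breakage surfaced inside the StepRunner rather than at import time.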
1 parent f8b87e5 commit 6b640d9

2 files changed: 5 additions & 5 deletions

**lib/marin/src/marin/processing/tokenize/attributes.py** (3 additions & 3 deletions)

```diff
@@ -55,7 +55,7 @@ class TokenizedAttrData(BaseModel):
     and co-partitioned with the source — both datakit invariants.
 
     Persisted as the step's ``.artifact``. Load via
-    ``Artifact.load(step, TokenizedAttrData)``.
+    ``Artifact.from_path(step, TokenizedAttrData)``.
 
     Attributes:
         version: Schema version.
@@ -281,9 +281,9 @@ def _fn(output_path: str) -> TokenizedAttrData:
             "max_workers": max_workers,
         }
         if train_normalize is not None:
-            kwargs["train_source"] = Artifact.load(train_normalize, NormalizedData)
+            kwargs["train_source"] = Artifact.from_path(train_normalize, NormalizedData)
         if validation_normalize is not None:
-            kwargs["validation_source"] = Artifact.load(validation_normalize, NormalizedData)
+            kwargs["validation_source"] = Artifact.from_path(validation_normalize, NormalizedData)
         if worker_resources is not None:
             kwargs["worker_resources"] = worker_resources
         return tokenize_attributes(TokenizeAttributesConfig(**kwargs))
```

**lib/marin/src/marin/processing/tokenize/store_builder.py** (2 additions & 2 deletions)

```diff
@@ -67,7 +67,7 @@ class LevanterStoreData(BaseModel):
     """Outcome of :func:`build_levanter_store`: a Levanter cache per split.
 
     Persisted as the step's ``.artifact``. Load via
-    ``Artifact.load(step, LevanterStoreData)``.
+    ``Artifact.from_path(step, LevanterStoreData)``.
 
     Attributes:
         version: Schema version.
@@ -354,7 +354,7 @@ def build_levanter_store_step(
 
     def _fn(output_path: str) -> LevanterStoreData:
         kwargs: dict = {
-            "sources": [Artifact.load(s, TokenizedAttrData) for s in tokenize_steps],
+            "sources": [Artifact.from_path(s, TokenizedAttrData) for s in tokenize_steps],
             "cache_path": output_path,
             "max_workers": max_workers,
             "levanter_batch_size": levanter_batch_size,
```
