Skip to content

Commit 72c97a2

Browse files
ravwojdyla and claude committed
Remove unused datakit/tokenize.py module
Only consumer was the integration test, which now uses StepSpec with TokenizeConfig directly. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 662c4e0 commit 72c97a2

2 files changed

Lines changed: 14 additions & 76 deletions

File tree

lib/marin/src/marin/datakit/tokenize.py

Lines changed: 0 additions & 71 deletions
This file was deleted.

tests/datakit/test_datakit.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@
1212

1313
from marin.datakit.download.huggingface import download_hf_step
1414
from marin.datakit.normalize import content_hash_id, normalize_step
15-
from marin.datakit.tokenize import tokenize_step
1615
from marin.execution.step_runner import StepRunner
16+
from marin.execution.step_spec import StepSpec
17+
from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize
1718

1819

1920
@pytest.mark.slow
@@ -35,11 +36,19 @@ def test_download_normalize_tokenize(tmp_path):
3536
override_output_path=str(tmp_path / "normalized"),
3637
)
3738

38-
tok = tokenize_step(
39-
"datakit/tokenize",
40-
input_path=norm.output_path,
41-
tokenizer="gpt2",
39+
tok = StepSpec(
40+
name="datakit/tokenize",
41+
fn=lambda output_path: tokenize(
42+
TokenizeConfig(
43+
train_paths=[norm.output_path],
44+
validation_paths=[],
45+
cache_path=output_path,
46+
tokenizer="gpt2",
47+
allow_test_in_train=True,
48+
)
49+
),
4250
deps=[norm],
51+
hash_attrs={"tokenizer": "gpt2"},
4352
override_output_path=str(tmp_path / "tokenized"),
4453
)
4554

0 commit comments

Comments (0)