Skip to content

Commit cd83f6d

Browse files
committed
Fix isoflop_analysis.py and reference.py to use StepSpec for dedup steps
ExecutorStep.fn receives the config object, not the output path. Using StepSpec(...).as_executor_step() preserves the lambda(output_path) calling convention via the round-trip short-circuit in resolve_executor_step.
1 parent 6879ff2 commit cd83f6d

2 files changed

Lines changed: 6 additions & 5 deletions

File tree

experiments/dedup/isoflop_analysis.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
)
2424
from experiments.scaling_law_sweeps.c_adamc import create_isoflop_sweep_steps
2525
from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned
26+
from marin.execution.step_spec import StepSpec
2627

2728
logger = logging.getLogger(__name__)
2829

@@ -61,14 +62,14 @@ def _get_vanilla_data_mixture(*, variant: str) -> LMMixtureDatasetConfig:
6162

6263
def _get_deduped_data_mixture(*, variant: str, mode: DedupMode, max_parallelism: int = 1024) -> LMMixtureDatasetConfig:
6364
"""Dedup fineweb-edu mixture"""
64-
dedup_step = ExecutorStep(
65+
dedup_step = StepSpec(
6566
name=f"dedup/{variant}_{mode.lower()}",
6667
fn=lambda op: _DEDUP_FN[mode](
6768
input_paths=downloads[variant],
6869
output_path=op,
6970
max_parallelism=max_parallelism,
7071
),
71-
)
72+
).as_executor_step()
7273

7374
dedup_mode_to_filter_type = {
7475
DedupMode.EXACT_PARAGRAPH: FilterType.REMOVE_SPANS,

experiments/dedup/reference.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import logging
55

66
from marin.execution.executor import ExecutorStep, InputName, executor_main
7+
from marin.execution.step_spec import StepSpec
78
from marin.processing.classification.deduplication.fuzzy import dedup_fuzzy_document
89

910
from experiments.pretraining_datasets.simple import downloads
@@ -12,15 +13,14 @@
1213

1314

1415
def build_dedup_step(dataset: InputName, max_parallelism: int) -> ExecutorStep:
15-
return ExecutorStep(
16+
return StepSpec(
1617
name=f"dedup_{dataset.name}",
1718
fn=lambda op: dedup_fuzzy_document(
1819
input_paths=dataset,
1920
output_path=op,
2021
max_parallelism=max_parallelism,
2122
),
22-
description=f"Run dedupe on {dataset.name}",
23-
)
23+
).as_executor_step()
2424

2525

2626
STEPS = [

0 commit comments

Comments
 (0)