Skip to content

Commit 841d4d0

Browse files
ravwojdyla and claude committed
Wire ar5iv_step into exp934 as StepSpec dependency
Same pattern as Wikipedia: download step with override pointing at existing data, transform step as StepSpec with download as dep.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 8f86929 commit 841d4d0

1 file changed

Lines changed: 26 additions & 15 deletions

File tree

experiments/exp934_hq_vs_pt.py

Lines changed: 26 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
datasets used by various training experiments.
99
"""
1010

11+
from marin.datakit.download.ar5iv import ar5iv_step
1112
from marin.datakit.download.wikipedia import download_wikipedia_step
1213
from marin.execution.executor import ExecutorStep, mirrored, this_output_path, versioned
1314
from marin.execution.step_spec import StepSpec
@@ -72,23 +73,33 @@
7273
)
7374
wikipedia_resiliparse_custom_fork = _wikipedia_transform.as_executor_step().cd("20241201")
7475

75-
# ar5iv resiliparse custom fork step (data already exists at hardcoded path)
76-
ar5iv_no_problem_resiliparse_custom_fork = ExecutorStep(
76+
_ar5iv_download = ar5iv_step(
77+
input_path="gs://marin-us-central2/raw/ar5iv/ar5iv-04-2024-no-problem.zip",
78+
override_output_path="raw/ar5iv/ar5iv-04-2024-no-problem-49c4e3",
79+
)
80+
81+
# ar5iv resiliparse custom fork step
82+
_ar5iv_transform = StepSpec(
7783
name="documents/ar5iv/ar5iv-04-2024-no-problem",
78-
fn=process_ar5iv_dump,
79-
config=Ar5ivExtractionConfig(
80-
input_path=mirrored("raw/ar5iv/ar5iv-04-2024-no-problem-49c4e3/202404", budget_gb=1),
81-
revision="042024",
82-
output_path=this_output_path("resiliparse-custom-fork"),
83-
extract_method=versioned("resiliparse"),
84-
extract_config=ResiliparseConfig(
85-
links=versioned(False),
86-
prepend_title=True,
87-
skip_elements=ARXIV_BLACKLISTED_SELECTORS,
88-
),
89-
remove_reference_section=versioned(True),
84+
fn=lambda output_path: process_ar5iv_dump(
85+
Ar5ivExtractionConfig(
86+
input_path=f"{_ar5iv_download.output_path}/202404",
87+
revision="042024",
88+
output_path=output_path,
89+
extract_method="resiliparse",
90+
extract_config=ResiliparseConfig(
91+
links=False,
92+
prepend_title=True,
93+
skip_elements=ARXIV_BLACKLISTED_SELECTORS,
94+
),
95+
remove_reference_section=True,
96+
)
9097
),
91-
).with_output_path("documents/ar5iv/ar5iv-04-2024-no-problem-3971f")
98+
deps=[_ar5iv_download],
99+
hash_attrs={"revision": "042024", "extract_method": "resiliparse"},
100+
override_output_path="documents/ar5iv/ar5iv-04-2024-no-problem-3971f",
101+
)
102+
ar5iv_no_problem_resiliparse_custom_fork = _ar5iv_transform.as_executor_step()
92103

93104
# MMLU Science QA tokenization
94105
medu_mmlu_science_qa_tokenized = default_tokenize(

0 commit comments

Comments
 (0)