|
8 | 8 | datasets used by various training experiments. |
9 | 9 | """ |
10 | 10 |
|
| 11 | +from marin.datakit.download.ar5iv import ar5iv_step |
11 | 12 | from marin.datakit.download.wikipedia import download_wikipedia_step |
12 | 13 | from marin.execution.executor import ExecutorStep, mirrored, this_output_path, versioned |
13 | 14 | from marin.execution.step_spec import StepSpec |
|
72 | 73 | ) |
73 | 74 | wikipedia_resiliparse_custom_fork = _wikipedia_transform.as_executor_step().cd("20241201") |
74 | 75 |
|
75 | | -# ar5iv resiliparse custom fork step (data already exists at hardcoded path) |
76 | | -ar5iv_no_problem_resiliparse_custom_fork = ExecutorStep( |
| 76 | +_ar5iv_download = ar5iv_step( |
| 77 | + input_path="gs://marin-us-central2/raw/ar5iv/ar5iv-04-2024-no-problem.zip", |
| 78 | + override_output_path="raw/ar5iv/ar5iv-04-2024-no-problem-49c4e3", |
| 79 | +) |
| 80 | + |
| 81 | +# ar5iv resiliparse custom fork step |
| 82 | +_ar5iv_transform = StepSpec( |
77 | 83 | name="documents/ar5iv/ar5iv-04-2024-no-problem", |
78 | | - fn=process_ar5iv_dump, |
79 | | - config=Ar5ivExtractionConfig( |
80 | | - input_path=mirrored("raw/ar5iv/ar5iv-04-2024-no-problem-49c4e3/202404", budget_gb=1), |
81 | | - revision="042024", |
82 | | - output_path=this_output_path("resiliparse-custom-fork"), |
83 | | - extract_method=versioned("resiliparse"), |
84 | | - extract_config=ResiliparseConfig( |
85 | | - links=versioned(False), |
86 | | - prepend_title=True, |
87 | | - skip_elements=ARXIV_BLACKLISTED_SELECTORS, |
88 | | - ), |
89 | | - remove_reference_section=versioned(True), |
| 84 | + fn=lambda output_path: process_ar5iv_dump( |
| 85 | + Ar5ivExtractionConfig( |
| 86 | + input_path=f"{_ar5iv_download.output_path}/202404", |
| 87 | + revision="042024", |
| 88 | + output_path=output_path, |
| 89 | + extract_method="resiliparse", |
| 90 | + extract_config=ResiliparseConfig( |
| 91 | + links=False, |
| 92 | + prepend_title=True, |
| 93 | + skip_elements=ARXIV_BLACKLISTED_SELECTORS, |
| 94 | + ), |
| 95 | + remove_reference_section=True, |
| 96 | + ) |
90 | 97 | ), |
91 | | -).with_output_path("documents/ar5iv/ar5iv-04-2024-no-problem-3971f") |
| 98 | + deps=[_ar5iv_download], |
| 99 | + hash_attrs={"revision": "042024", "extract_method": "resiliparse"}, |
| 100 | + override_output_path="documents/ar5iv/ar5iv-04-2024-no-problem-3971f", |
| 101 | +) |
| 102 | +ar5iv_no_problem_resiliparse_custom_fork = _ar5iv_transform.as_executor_step() |
92 | 103 |
|
93 | 104 | # MMLU Science QA tokenization |
94 | 105 | medu_mmlu_science_qa_tokenized = default_tokenize( |
|
0 commit comments