|
20 | 20 | ) |
21 | 21 | from marin.execution.step_spec import StepSpec |
22 | 22 | from marin.processing.classification.consolidate import FilterConfig, FilterType, consolidate, ConsolidateConfig |
23 | | -from marin.processing.classification.dataset_utils import DatasetConfig |
24 | 23 | from marin.processing.classification.deduplication.exact import dedup_exact_paragraph |
25 | 24 | from marin.processing.classification.deduplication.fuzzy import dedup_fuzzy_document |
26 | | -from marin.processing.classification.fasttext.train_fasttext import ( |
27 | | - TrainFasttextClassifierConfig, |
28 | | - train, |
29 | | -) |
30 | | -from marin.processing.classification.inference import InferenceConfig, run_inference |
31 | 25 | from marin.processing.tokenize import lm_data_config |
32 | 26 | from marin.processing.tokenize.tokenize import TokenizeConfig, tokenize |
33 | 27 | from marin.schemas.web.convert import ResiliparseConfig |
@@ -153,64 +147,6 @@ def create_steps(prefix: str, synth_data: str) -> list[ExecutorStep]: |
153 | 147 | transform_hq_data_step = transform_hq_data_spec.as_executor_step() |
154 | 148 | transform_lq_data_step = transform_lq_data_spec.as_executor_step() |
155 | 149 |
|
156 | | - # ############################################################ |
157 | | - # Train quality classifier |
158 | | - |
159 | | - train_quality_step = ExecutorStep( |
160 | | - name=os.path.join(prefix, "quality-classifier"), |
161 | | - fn=train, |
162 | | - config=TrainFasttextClassifierConfig( |
163 | | - datasets=[ |
164 | | - DatasetConfig( |
165 | | - input_doc_path=transform_hq_data_step, |
166 | | - label="hq", |
167 | | - sampling_rate=1.0, |
168 | | - ), |
169 | | - DatasetConfig( |
170 | | - input_doc_path=transform_lq_data_step, |
171 | | - label="lq", |
172 | | - sampling_rate=1.0, |
173 | | - ), |
174 | | - ], |
175 | | - output_path=this_output_path(), |
176 | | - fasttext_args={ |
177 | | - "lr": 0.001, |
178 | | - "minCount": 1, |
179 | | - "epoch": 25, |
180 | | - "wordNgrams": 2, |
181 | | - "dim": 50, |
182 | | - "thread": 1, |
183 | | - }, |
184 | | - ), |
185 | | - ) |
186 | | - |
187 | | - ############################################################ |
188 | | - # Run inference with quality classifier |
189 | | - |
190 | | - inference_hq_step = ExecutorStep( |
191 | | - name=os.path.join(prefix, "hq-inference"), |
192 | | - fn=run_inference, |
193 | | - config=InferenceConfig( |
194 | | - input_path=transform_hq_data_step, |
195 | | - output_path=this_output_path(), |
196 | | - model_name=train_quality_step, |
197 | | - model_type="fasttext", |
198 | | - attribute_name="quickstart-fasttext-quality-hq", |
199 | | - ), |
200 | | - ) |
201 | | - |
202 | | - inference_lq_step = ExecutorStep( |
203 | | - name=os.path.join(prefix, "lq-inference"), |
204 | | - fn=run_inference, |
205 | | - config=InferenceConfig( |
206 | | - input_path=transform_lq_data_step, |
207 | | - output_path=this_output_path(), |
208 | | - model_name=train_quality_step, |
209 | | - model_type="fasttext", |
210 | | - attribute_name="quickstart-fasttext-quality-lq", |
211 | | - ), |
212 | | - ) |
213 | | - |
214 | 150 | ############################################################ |
215 | 151 | # Deduplicate (StepSpec — depends on transform StepSpecs) |
216 | 152 |
|
@@ -351,9 +287,6 @@ def create_steps(prefix: str, synth_data: str) -> list[ExecutorStep]: |
351 | 287 | return [ |
352 | 288 | transform_hq_data_step, |
353 | 289 | transform_lq_data_step, |
354 | | - train_quality_step, |
355 | | - inference_hq_step, |
356 | | - inference_lq_step, |
357 | 290 | dedup_exact_paragraph_step, |
358 | 291 | dedup_fuzzy_document_step, |
359 | 292 | validate_exact_dedup_step, |
|
0 commit comments