consolidate: drop unused CLASSIFY filter and percentile machinery

ravwojdyla · ravwojdyla · commit 978fb79b5dba · 2026-04-14T17:07:57.000-07:00
CLASSIFY was only exercised by tests and a broken quickstart yaml (no CLI
backs it). The two real callers — datakit_ferry and the integration test —
use REMOVE_DOC and REMOVE_SPANS. Removing the dead path drops ddsketch and
the percentile threshold code with it.
diff --git a/lib/marin/pyproject.toml b/lib/marin/pyproject.toml
@@ -12,7 +12,6 @@ dependencies = [
     "braceexpand",
     "cryptography>=45",
     "datasets<4.0.0",
-    "ddsketch",
     "deepdiff",
     "draccus>=0.11.5",
     "fasteners>=0.19",
diff --git a/lib/marin/src/marin/processing/classification/README.md b/lib/marin/src/marin/processing/classification/README.md
@@ -23,27 +23,5 @@ uv run python -m marin.processing.classification.decon \
 
 [`consolidate.py`](./consolidate.py) consumes attribute files and filters or rewrites documents. Supported filter types:
 
-- `classify`: keep or reject documents based on attribute scores
 - `remove_spans`: remove text spans such as duplicate paragraphs
 - `remove_docs`: drop whole documents when an attribute marks them as duplicates
-
-Example:
-
-```bash
-uv run python -m marin.processing.classification.consolidate \
-  --config_path lib/marin/src/marin/processing/classification/config/quickstart_consolidate_fasttext.yaml
-```
-
-Example `classify` filter:
-
-```yaml
-input_path: "gs://marin-us-central2/documents/hello_world_fw/v1.0/quickstart/"
-output_path: "gs://marin-us-central2/documents/hello_world_fw/v1.0/quickstart_fasttext_only/"
-
-filters:
-  - type: "classify"
-    attribute_path: "gs://marin-us-central2/attributes/hello_world_fw/v1.0/quickstart_olmo_fasttext/"
-    name: "olmo-fasttext-quality"
-    label: "__label__hq"
-    lower_threshold: 0.1
-```
diff --git a/lib/marin/src/marin/processing/classification/config/quickstart_consolidate_fasttext.yaml b/lib/marin/src/marin/processing/classification/config/quickstart_consolidate_fasttext.yaml
diff --git a/lib/marin/src/marin/processing/classification/consolidate.py b/lib/marin/src/marin/processing/classification/consolidate.py
@@ -4,9 +4,7 @@
 """
 Consolidate takes a set of documents with corresponding attributes and writes
 out a subset of the documents based on various filters defined with respect to
-the attributes.  Handles three cases:
-- Quality filtering produces attributes (e.g., fasttext-quality) with labels
-  (e.g., __label__hq), filter on threshold.
+the attributes.  Handles two cases:
 - Span removal produces attributes (e.g., duplicate_text spans). Remove text spans.
 - Document removal via attribute produced by deduplication.
 
@@ -19,8 +17,8 @@
 
 import logging
 import os
-from collections.abc import Callable, Iterator
-from dataclasses import dataclass, replace
+from collections.abc import Callable
+from dataclasses import dataclass
 from enum import StrEnum
 from typing import Any
 
@@ -34,7 +32,6 @@
 
 
 class FilterType(StrEnum):
-    CLASSIFY = "classify"
     REMOVE_SPANS = "remove_spans"
     REMOVE_DOC = "remove_docs"
 
@@ -55,21 +52,6 @@ class FilterConfig:
     name: str
     """Name of attribute to use for filtering."""
 
-    label: str | None = None
-    """The label under the attribute name."""
-
-    lower_threshold: float | None = None
-    """Keep documents where the value is above this."""
-
-    keep_fraction: float | None = None
-    """Keep documents where the score is in the top percentile. Calculates the threshold from the entire dataset."""
-
-    upper_threshold: float | None = None
-    """Keep documents where the value is below this."""
-
-    reverse: bool = False
-    """Reverse the filter."""
-
     attribute_filetype: str | None = None
     """File extension for attribute files (e.g. 'jsonl.gz', 'vortex'). If None, uses the input filetype."""
 
@@ -84,32 +66,6 @@ class FilterConfig:
 }
 
 
-def _is_valid(doc: dict, filt: FilterConfig, attributes: dict) -> bool:
-    assert filt.type == FilterType.CLASSIFY
-    attribute_value = attributes[filt.name]
-
-    # Handle nested attributes structure if a label is specified
-    if filt.label is not None:
-        if isinstance(attribute_value, dict) and filt.label in attribute_value:
-            value = attribute_value[filt.label]
-        else:
-            raise ValueError(f"Label {filt.label} not found in attribute {filt.name} for document {doc}")
-    else:
-        value = attribute_value
-
-    # Check both lower and upper bounds if specified
-    accepted = True
-    if filt.lower_threshold is not None and value < filt.lower_threshold:
-        accepted = False
-    if filt.upper_threshold is not None and value > filt.upper_threshold:
-        accepted = False
-
-    if filt.reverse:
-        accepted = not accepted
-
-    return accepted
-
-
 def _remove_spans_from_doc(doc: dict, filt: FilterConfig, attributes: dict) -> dict:
     def _remove_spans(text: str, spans: list[list[int]]) -> str:
         """Return ``text`` with ``spans`` removed.
@@ -151,93 +107,6 @@ def _make_id_extractor(corpus_type: str) -> Callable[[dict], Any]:
     return lambda r: extract_id(r, corpus_type)
 
 
-def _compute_percentile_threshold(
-    attr_paths: list[str], attr_name: str, attr_label: str | None, keep_fraction: float
-) -> float:
-    """Compute percentile threshold for a single filter using DDSketch reduction.
-
-    Args:
-        attr_paths: Paths to attribute files
-        attr_name: Name of attribute to extract
-        attr_label: Optional label within attribute (for nested dicts)
-        keep_fraction: Fraction of documents to keep (0-1)
-
-    Returns:
-        Threshold value at the (1 - keep_fraction) quantile
-    """
-    from ddsketch import DDSketch
-
-    def local_reducer(rows: Iterator[dict], attr_name: str = attr_name, attr_label: str | None = attr_label) -> DDSketch:
-        """Build DDSketch from rows in a single shard."""
-        sketch = DDSketch()
-        for row in rows:
-            attributes = row["attributes"]
-            value = attributes[attr_name][attr_label] if attr_label else attributes[attr_name]
-            sketch.add(value)
-        return sketch
-
-    def global_reducer(sketches: Iterator[DDSketch]) -> DDSketch:
-        """Merge all shard sketches into one."""
-        combined = DDSketch()
-        for sketch in sketches:
-            combined.merge(sketch)
-        return combined
-
-    ctx = ZephyrContext(name="consolidate-stats")
-    result = ctx.execute(
-        Dataset.from_list(attr_paths)
-        .load_file()
-        .select("attributes")
-        .reduce(local_reducer=local_reducer, global_reducer=global_reducer)
-    ).results
-
-    combined_sketch = next(iter(result))
-    threshold = combined_sketch.get_quantile_value(1 - keep_fraction)
-    return threshold
-
-
-def calculate_percentile_thresholds(
-    *,
-    input_path: str,
-    filters: list[FilterConfig],
-    filetype: str = "jsonl.gz",
-) -> list[FilterConfig]:
-    """Resolve ``keep_fraction`` filters to ``lower_threshold`` via percentile calculation.
-
-    Returns a new list of filters with percentile-based thresholds resolved.
-    """
-    updated_filters = []
-    input_paths = fsspec_glob(os.path.join(input_path, f"**/*.{filetype}"))
-
-    for filt in filters:
-        # Validate threshold configuration
-        if filt.keep_fraction is not None and filt.lower_threshold is not None:
-            raise ValueError("Cannot specify both keep_fraction and lower_threshold. Please specify only one.")
-
-        # Skip if no percentile calculation needed
-        if filt.keep_fraction is None:
-            updated_filters.append(filt)
-            continue
-
-        if not (0 < filt.keep_fraction < 1):
-            raise ValueError("keep_fraction must be between 0 and 1")
-
-        # Only applies to CLASSIFY filters
-        if filt.type != FilterType.CLASSIFY:
-            logger.warning(f"keep_fraction only applies to CLASSIFY filters, ignoring for {filt.name}")
-            updated_filters.append(filt)
-            continue
-
-        attr_paths = _attribute_paths_for_filter(input_path, input_paths, filt, filetype)
-        attr_paths = [p for p in attr_paths if p is not None]
-
-        threshold = _compute_percentile_threshold(attr_paths, filt.name, filt.label, filt.keep_fraction)
-        logger.info(f"Calculated threshold {threshold} for {filt.name} to keep {filt.keep_fraction} of documents")
-        updated_filters.append(replace(filt, lower_threshold=threshold, keep_fraction=None))
-
-    return updated_filters
-
-
 def _resolve_attribute_path(input_base: str, input_path: str, filt: FilterConfig, filetype: str) -> str | None:
     """Map an input file path to its attribute file path, with glob fallback for compression suffixes."""
     new_extension = f".{filt.attribute_filetype}" if filt.attribute_filetype else f".{filetype}"
@@ -288,8 +157,6 @@ def combine(left: dict, right: dict | None) -> dict | None:
             return left if filt.keep_if_missing else None
 
         attrs = right["attributes"]
-        if filt.type == FilterType.CLASSIFY:
-            return left if _is_valid(left, filt, attrs) else None
         if filt.type == FilterType.REMOVE_DOC:
             return left if not attrs.get(filt.name, False) else None
         assert filt.type == FilterType.REMOVE_SPANS
@@ -311,8 +178,7 @@ def consolidate(
 
     Joins each input file with its (co-partitioned, sorted) attribute files via
     chained ``sorted_merge_join`` ops — one left join per filter, with the
-    filter's keep/mutate/drop logic encoded in its combiner. No in-memory hash
-    table is materialized.
+    filter's keep/mutate/drop logic encoded in its combiner.
 
     Args:
         input_path: Directory (recursively) containing input documents.
@@ -321,13 +187,12 @@ def consolidate(
         filetype: Extension of the input documents (default: ``"jsonl.gz"``).
         worker_resources: Optional Zephyr worker resource config (defaults to Zephyr defaults).
     """
-    filters = calculate_percentile_thresholds(input_path=input_path, filters=filters, filetype=filetype)
     input_paths = sorted(fsspec_glob(os.path.join(input_path, f"**/*.{filetype}")))
     if not input_paths:
         raise ValueError(f"No input files matched {input_path}/**/*.{filetype}")
     logger.info(f"Consolidating {len(input_paths)} document files via {len(filters)} sorted_merge_join(s)")
 
-    # Determine id key; assume a uniform corpus across shards (matches prior per-shard behavior
+    # Determine id key; assume a uniform corpus across shards (matches prior per-shard behavior)
     # since datakit inputs are all "default" — "dclm" was the only alternative).
     corpus_type = "dclm" if any("dclm" in p for p in input_paths) else "default"
     id_key = CORPUS_TYPE_TO_ID_GUIDE[corpus_type]["key"]
diff --git a/tests/processing/classification/test_consolidate.py b/tests/processing/classification/test_consolidate.py
@@ -1,133 +1,19 @@
 # Copyright The Marin Authors
 # SPDX-License-Identifier: Apache-2.0
 
-import gzip
-import json
 import os
 from pathlib import Path
 
-from marin.processing.classification.deduplication.dedup_commons import DedupMode
-from marin.processing.classification.deduplication.exact import dedup_exact_paragraph
-import pytest
-from ddsketch import DDSketch
 from marin.processing.classification.consolidate import (
     FilterConfig,
     FilterType,
-    calculate_percentile_thresholds,
     consolidate,
 )
+from marin.processing.classification.deduplication.dedup_commons import DedupMode
+from marin.processing.classification.deduplication.exact import dedup_exact_paragraph
 from zephyr.readers import load_parquet
 
 
-def _write_jsonl(path: Path, rows: list[dict]) -> None:
-    with path.open("w", encoding="utf-8") as handle:
-        for row in rows:
-            handle.write(json.dumps(row) + "\n")
-
-
-def test_calculate_percentile_threshold(tmp_path):
-    documents_dir = tmp_path / "documents"
-    attributes_dir = tmp_path / "attributes"
-    documents_dir.mkdir()
-    attributes_dir.mkdir()
-
-    attribute_rows = [
-        [
-            {"id": "doc-0", "attributes": {"quality": {"good": 0.1}}},
-            {"id": "doc-1", "attributes": {"quality": {"good": 0.4}}},
-        ],
-        [
-            {"id": "doc-2", "attributes": {"quality": {"good": 0.7}}},
-            {"id": "doc-3", "attributes": {"quality": {"good": 0.9}}},
-        ],
-    ]
-
-    for shard_index, rows in enumerate(attribute_rows):
-        doc_path = documents_dir / f"part-{shard_index}.jsonl"
-        doc_path.write_text("{}", encoding="utf-8")
-        attr_path = attributes_dir / f"part-{shard_index}.jsonl"
-        _write_jsonl(attr_path, rows)
-
-    keep_fraction = 0.5
-    filters = [
-        FilterConfig(
-            type=FilterType.CLASSIFY,
-            attribute_path=str(attributes_dir),
-            name="quality",
-            label="good",
-            keep_fraction=keep_fraction,
-        )
-    ]
-
-    updated_filters = calculate_percentile_thresholds(input_path=str(documents_dir), filters=filters, filetype="jsonl")
-    threshold = updated_filters[0].lower_threshold
-
-    # Calculate expected threshold
-    expected_sketch = DDSketch()
-    for shard in attribute_rows:
-        for row in shard:
-            expected_sketch.add(row["attributes"]["quality"]["good"])
-    expected_threshold = expected_sketch.get_quantile_value(1 - keep_fraction)
-
-    assert threshold == pytest.approx(expected_threshold, rel=1e-6)
-
-
-def _write_jsonl_gz(path: Path, rows: list[dict]) -> None:
-    with gzip.open(path, "wt", encoding="utf-8") as handle:
-        for row in rows:
-            handle.write(json.dumps(row) + "\n")
-
-
-def test_consolidate_filters_and_writes_output(tmp_path):
-    """Test that consolidate filters documents and writes output using zephyr."""
-    input_root = tmp_path / "input"
-    attributes_root = tmp_path / "attributes"
-    output_root = tmp_path / "output"
-    input_root.mkdir()
-    attributes_root.mkdir()
-    output_root.mkdir()
-
-    input_rows = [
-        {"id": "doc-0", "text": "first"},
-        {"id": "doc-1", "text": "second"},
-        {"id": "doc-2", "text": "third"},
-    ]
-    attribute_rows = [
-        {"id": "doc-0", "attributes": {"quality": {"good": 0.1}}},
-        {"id": "doc-1", "attributes": {"quality": {"good": 0.6}}},
-        {"id": "doc-2", "attributes": {"quality": {"good": 0.8}}},
-    ]
-
-    input_file = input_root / "part-0000.jsonl.gz"
-    attribute_file = attributes_root / "part-0000.jsonl.gz"
-    _write_jsonl_gz(input_file, input_rows)
-    _write_jsonl_gz(attribute_file, attribute_rows)
-
-    consolidate(
-        input_path=str(input_root),
-        output_path=str(output_root),
-        filters=[
-            FilterConfig(
-                type=FilterType.CLASSIFY,
-                attribute_path=str(attributes_root),
-                name="quality",
-                label="good",
-                lower_threshold=0.5,
-            )
-        ],
-    )
-
-    output_file = output_root / "part-0000.parquet"
-    assert (
-        output_file.exists()
-    ), f"Expected consolidated output file to be written. Files in {output_root}: {list(output_root.iterdir())}"
-
-    output_rows = load_parquet(str(output_file))
-
-    kept_ids = {row["id"] for row in output_rows}
-    assert kept_ids == {"doc-1", "doc-2"}, f"Expected to keep doc-1 and doc-2, but got {kept_ids}"
-
-
 def test_dedupe_consolidate_integration(fox_corpus):
     """Integration test: dedupe generates attributes, consolidate filters based on them.