Skip to content

Commit 74abb88

Browse files
authored
Target side deduplication based on bicleaner scores (#1201)
* Target side deduplication using Bicleaner scores * Add scores.zst artifact * bicleaner: create a dummy scores when no filtering * Fix linting * Yield dummy scores when scores is not provided This allows to use merge-parallel with devsets * Fix linting * Rename Bicleaner step .scores.zst to best-scores * Adapt test_merge_corpus to dedup by target Extend the cases to check that different sentence pairs with different sources but same target are deduplicated by target and the source with the best scores kept. * Move generator logic to a function
1 parent 455225a commit 74abb88

File tree

6 files changed

+222
-75
lines changed

6 files changed

+222
-75
lines changed

pipeline/bicleaner/bicleaner.sh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ if [ "${bicleaner_threshold}" == "0" ] || [ "${bicleaner_threshold}" == "0.0" ];
3434
echo "Threshold is 0, skipping filtering"
3535
cp "${corpus_prefix}.${SRC}.zst" "${output_prefix}.${SRC}.zst"
3636
cp "${corpus_prefix}.${TRG}.zst" "${output_prefix}.${TRG}.zst"
37+
# Create a dummy best-scores.zst: if no filtering is done, every sentence gets a perfect score
38+
# this is needed for target side dedup in merge-parallel
39+
num_sents=$(zstdcat "${corpus_prefix}.${TRG}.zst" | wc -l)
40+
awk -v n=$num_sents 'BEGIN {for(i=0;i<n;i++) print "1.0";}' | zstdmt >"${output_prefix}.best-scores.zst"
3741
else
3842

3943
export scol=1
@@ -98,7 +102,8 @@ else
98102
echo "### Writing output corpus"
99103
zstdmt -dc "${output_prefix}.best.zst" |
100104
tee >(cut -f1 | zstdmt >"${output_prefix}.${SRC}.zst") |
101-
cut -f2 | zstdmt >"${output_prefix}.${TRG}.zst"
105+
tee >(cut -f2 | zstdmt >"${output_prefix}.${TRG}.zst") |
106+
cut -f3 | zstdmt >"${output_prefix}.best-scores.zst"
102107

103108
# do not delete intermediate files to inspect them and tune the threshold
104109
fi

pipeline/clean/merge-parallel.py

Lines changed: 79 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from pipeline.common.datasets import (
2323
FilteringStep,
2424
Statistics,
25-
WeakStringSet,
25+
WeakStringDict,
2626
shuffle_with_max_lines,
2727
)
2828
from pipeline.common.downloads import get_human_readable_file_size, read_lines, write_lines
@@ -58,17 +58,24 @@ def log_dataset(location: str):
5858
logger.info(f"Reading dataset {location}")
5959

6060

61+
def dummy_score_generator():
62+
for i in iter(int, 1):
63+
yield "1.0"
64+
65+
6166
class DeduplicateCorpus:
6267
def __init__(
6368
self,
6469
datasets_src: list[Path],
6570
datasets_trg: list[Path],
71+
datasets_scores: list[Path],
6672
src_outpath: Path,
6773
trg_outpath: Path,
6874
stats: FilteringStatistics,
6975
) -> None:
7076
self.datasets_src: list[Path] = datasets_src
7177
self.datasets_trg: list[Path] = datasets_trg
78+
self.datasets_scores: list[Path] = datasets_scores
7279
self.src_outpath: Path = src_outpath
7380
self.trg_outpath: Path = trg_outpath
7481
self.stats: FilteringStatistics = stats
@@ -105,30 +112,63 @@ def run(
105112
stats.final_truncated.kept = stats.parallel_corpus.kept
106113
stats.final_truncated.visited = stats.parallel_corpus.kept
107114

108-
def yield_lines_tuple(self, stack: ExitStack) -> Generator[tuple[str, str], None, None]:
109-
strings_seen = WeakStringSet()
110-
stats = self.stats
115+
def on_enter_location(self, location):
116+
log_dataset(location)
117+
self.dataset_stats = self.stats.add_parallel_dataset(location)
118+
119+
def _yield_lines(self, stack: ExitStack, add_stats: bool = False):
120+
if add_stats:
121+
enter_location_func = self.on_enter_location
122+
else:
123+
enter_location_func = log_dataset
124+
111125
src_lines: Generator[str, None, None] = stack.enter_context(
112-
read_lines(self.datasets_src, on_enter_location=self.on_enter_location)
126+
read_lines(self.datasets_src, on_enter_location=enter_location_func)
113127
)
114128
trg_lines: Generator[str, None, None] = stack.enter_context(
115129
read_lines(self.datasets_trg, on_enter_location=log_dataset)
116130
)
131+
if self.datasets_scores == []:
132+
logger.info("No scores found, deduping without score")
133+
scores_lines = dummy_score_generator()
134+
else:
135+
scores_lines: Generator[str, None, None] = stack.enter_context(
136+
read_lines(self.datasets_scores, on_enter_location=log_dataset)
137+
)
117138

118-
for src_line, trg_line in zip(src_lines, trg_lines):
119-
# No separator is needed as the newline is included.
120-
line = src_line + trg_line
139+
for i, (src_line, trg_line, score_line) in enumerate(
140+
zip(src_lines, trg_lines, scores_lines)
141+
):
142+
try:
143+
score = float(score_line)
144+
except ValueError as e:
145+
raise ValueError(f"Could not parse score in line {i}") from e
121146

122-
if line in strings_seen:
123-
stats.parallel_corpus.filtered += 1
124-
self.dataset_stats.filtered += 1
125-
else:
147+
yield src_line, trg_line, score
148+
149+
def yield_lines_tuple(self, stack: ExitStack) -> Generator[tuple[str, str], None, None]:
150+
strings_seen = WeakStringDict()
151+
stats = self.stats
152+
for src_line, trg_line, score in self._yield_lines(stack):
153+
# store all possible targets
154+
# for all the sentence pairs that have the same target, keep the best score
155+
if trg_line not in strings_seen or strings_seen[trg_line] < score:
156+
strings_seen[trg_line] = score
157+
158+
for src_line, trg_line, score in self._yield_lines(stack, add_stats=True):
159+
# When a target's score equals the stored score (i.e. the best score),
160+
# we keep the sentence pair
161+
if trg_line in strings_seen and strings_seen[trg_line] == score:
126162
stats.parallel_corpus.kept += 1
127163
self.dataset_stats.kept += 1
128-
129-
strings_seen.add(line)
164+
# the item is removed from the dict to avoid keeping two sentence pairs
165+
# that have the same target AND the same score
166+
del strings_seen[trg_line]
130167

131168
yield src_line, trg_line
169+
else:
170+
stats.parallel_corpus.filtered += 1
171+
self.dataset_stats.filtered += 1
132172

133173
def yield_lines_string(self, stack: ExitStack) -> Generator[str, None, None]:
134174
for src_line, trg_line in self.yield_lines_tuple(stack):
@@ -139,10 +179,6 @@ def yield_lines_string(self, stack: ExitStack) -> Generator[str, None, None]:
139179
else:
140180
yield f"{src_line}\t{trg_line}"
141181

142-
def on_enter_location(self, location):
143-
log_dataset(location)
144-
self.dataset_stats = self.stats.add_parallel_dataset(location)
145-
146182

147183
def sample_corpus(
148184
artifacts: Path, name: str, sample_size: int, src_outpath: Path, trg_outpath: Path
@@ -204,24 +240,43 @@ def get_datasets(src: str, trg: str, datasets_glob: str):
204240
dataset_paths: list[str] = glob(datasets_glob)
205241
datasets_src: list[Path] = []
206242
datasets_trg: list[Path] = []
243+
datasets_scores: list[Path] = []
207244
dataset_paths.sort()
208245

209246
total_corpus_bytes = 0
210247

211248
for dataset in dataset_paths:
212249
path = Path(dataset)
250+
countbytes = True
213251
if dataset.endswith(f"{src}.zst"):
214252
datasets_src.append(path)
215253
elif dataset.endswith(f"{trg}.zst"):
216254
datasets_trg.append(path)
255+
elif dataset.endswith(".best-scores.zst"):
256+
datasets_scores.append(path)
257+
countbytes = False
217258
else:
218259
raise Exception(f"Dataset does not match naming scheme: {dataset}")
219260

220-
formatted_size, bytes = get_human_readable_file_size(path)
221-
logger.info(f" - {path} ({formatted_size})")
222-
total_corpus_bytes += bytes
261+
# Do not count bytes of the scores
262+
if countbytes:
263+
formatted_size, bytes = get_human_readable_file_size(path)
264+
logger.info(f" - {path} ({formatted_size})")
265+
total_corpus_bytes += bytes
266+
267+
# Fail if a different number of files per dataset is found,
268+
# but do not fail if no .scores are provided (when running for devsets)
269+
if (
270+
len(datasets_src) != len(datasets_trg) or len(datasets_src) != len(datasets_scores)
271+
) and datasets_scores != []:
272+
logger.info(datasets_src)
273+
logger.info(datasets_trg)
274+
logger.info(datasets_scores)
275+
raise Exception(
276+
f"Number of files per dataset is different src: {len(datasets_src)} trg: {len(datasets_trg)} scores: {len(datasets_scores)}"
277+
)
223278

224-
return datasets_src, datasets_trg, total_corpus_bytes
279+
return datasets_src, datasets_trg, datasets_scores, total_corpus_bytes
225280

226281

227282
def main() -> None:
@@ -273,7 +328,7 @@ def main() -> None:
273328

274329
args = parser.parse_args()
275330

276-
datasets_src, datasets_trg, total_corpus_bytes = get_datasets(
331+
datasets_src, datasets_trg, datasets_scores, total_corpus_bytes = get_datasets(
277332
args.src, args.trg, args.datasets_glob
278333
)
279334

@@ -291,6 +346,7 @@ def main() -> None:
291346
deduplicate_corpus = DeduplicateCorpus(
292347
datasets_src,
293348
datasets_trg,
349+
datasets_scores,
294350
src_outpath,
295351
trg_outpath,
296352
stats,

pipeline/common/datasets.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from io import TextIOWrapper
1010
from pathlib import Path
1111
from random import Random
12-
from typing import Callable, Iterator, Literal, Optional, Set, Union
12+
from typing import Callable, Iterator, Literal, Optional, Set, Union, Dict
1313
from urllib.parse import urlparse
1414
import unicodedata
1515

@@ -448,6 +448,57 @@ def _hash_string(string: str) -> int:
448448
return hash(cleaned_line)
449449

450450

451+
class WeakStringDict(Dict):
452+
"""
453+
A Dict that weakly holds on to key strings by storing a hashed `int`. Using this class
454+
makes it easy to see if a string is duplicated across large datasets without holding
455+
the entire set of strings in memory.
456+
457+
This is an alternate version of WeakStringSet that also stores a float value (score)
458+
associated to the string.
459+
460+
Usage:
461+
unique_strings = WeakStringDict()
462+
unique_strings["string a"] = 0.78
463+
unique_strings["string b"] = 0.911
464+
465+
assert "string a" in unique_strings
466+
assert "string b" in unique_strings
467+
assert "string c" not in unique_strings
468+
"""
469+
470+
def __init__(self, iter: Optional[Iterable[str]] = None) -> None:
471+
if iter:
472+
super().__init__((WeakStringDict._hash_string(string) for string in iter))
473+
else:
474+
super().__init__()
475+
476+
def __contains__(self, string: str) -> bool:
477+
return super().__contains__(WeakStringDict._hash_string(string))
478+
479+
def __setitem__(self, string: str, val: float) -> None:
480+
"""
481+
Add/set a string in the weak dict as key, together with its associated value.
482+
The strings are stored uniquely based on their
483+
contents with the whitespace surrounding them stripped.
484+
"""
485+
super().__setitem__(WeakStringDict._hash_string(string), val)
486+
487+
def __delitem__(self, string: str):
488+
super().__delitem__(WeakStringDict._hash_string(string))
489+
490+
def __getitem__(self, string: str) -> float:
491+
return super().__getitem__(WeakStringDict._hash_string(string))
492+
493+
def _hash_string(string: str) -> int:
494+
"""
495+
Return a hash of a line. The line has its whitespace stripped and text representation
496+
normalized to ensure a consistent representation.
497+
"""
498+
cleaned_line = unicodedata.normalize("NFC", string.strip())
499+
return hash(cleaned_line)
500+
501+
451502
def decompress(
452503
source: Union[str, Path],
453504
destination: Optional[Union[Path, str]] = None,

taskcluster/kinds/corpus-merge-parallel/kind.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ tasks:
7878
upstream-artifacts:
7979
- "{dataset_sanitized}.{src_locale}.zst"
8080
- "{dataset_sanitized}.{trg_locale}.zst"
81+
- "{dataset_sanitized}.best-scores.zst"
8182
upstream-task-attributes:
8283
cleaning-type:
8384
by-cleaning-type:

tests/test_common_datasets.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from pipeline.common.logging import get_logger
1010
from pipeline.common.datasets import (
1111
WeakStringSet,
12+
WeakStringDict,
1213
compress,
1314
decompress,
1415
shuffle_in_temp_files,
@@ -218,6 +219,31 @@ def test_weak_string_set():
218219
assert len(unique_strings2) == 2
219220

220221

222+
def test_weak_string_dict():
223+
unique_strings_scores = WeakStringDict()
224+
unique_strings_scores["aa"] = 0.87
225+
unique_strings_scores["aa"] = 0.92
226+
unique_strings_scores["ab"] = 4.1
227+
228+
assert "aa" in unique_strings_scores
229+
assert "ab" in unique_strings_scores
230+
assert unique_strings_scores["aa"] == 0.92
231+
assert unique_strings_scores["aa"] != 0.87
232+
233+
del unique_strings_scores["aa"]
234+
assert "aa" not in unique_strings_scores
235+
236+
assert len(unique_strings_scores) == 1
237+
238+
unique_strings_scores["cdf"] = 33.2
239+
assert "cdf" in unique_strings_scores
240+
assert unique_strings_scores["cdf"] == 33.2
241+
unique_strings_scores["aa"] = 0.33
242+
unique_strings_scores["ab"] = 0.34
243+
unique_strings_scores["aa"] = 0.01
244+
assert unique_strings_scores["aa"] == 0.01
245+
246+
221247
@pytest.mark.parametrize("suffix", ["zst", "gz"])
222248
@pytest.mark.parametrize("remove_or_keep", ["remove", "keep"])
223249
def test_compress(suffix: str, remove_or_keep: str):

0 commit comments

Comments
 (0)