Text Inclusion Analysis, include matched word level longest common subsequence (#92)

mgrange1998 · meta-codesync[bot] · commit 1dcc9606075e · 2026-01-27T11:33:20.000-08:00
Summary: Pull Request resolved: #92

This diff adds support to the text inclusion analysis node for returning the matched text from the word-level longest common subsequence. This allows us to inspect the matched text and to compare its length against the lengths of the target and generated text.

Reviewed By: lucamelis

Differential Revision: D89493658

fbshipit-source-id: fbd310375054ddb116c8297e1557579780e9fc53
diff --git a/privacy_guard/analysis/extraction/text_inclusion_analysis_node.py b/privacy_guard/analysis/extraction/text_inclusion_analysis_node.py
@@ -53,6 +53,65 @@ class TextInclusionAnalysisNodeOutput(BaseAnalysisOutput):
         None  # Include for future reference
     )
 
+    def format_single_word_level_lcs_result(
+        self,
+        num_matched_words: int,
+        matched_string: str,
+        augmented_row: Dict[str, Any],
+        analysis_input: TextInclusionAnalysisInput,
+    ) -> Dict[str, Any]:
+        prompt = augmented_row[analysis_input.prompt_key]
+        prediction = augmented_row[analysis_input.generation_key]
+
+        target = augmented_row[analysis_input.target_key]
+        # TODO: whether consecutive whitespace is removed here should follow the analysis input's remove_consecutive_whitespace setting; for now it is always removed.
+        clean_target_len = len(_clean_text_remove_consecutive_whitespace(text=target))
+
+        matched_string_char_length = len(matched_string)
+        word_level_lcs_result_dict = {
+            "Count of matched words": num_matched_words,
+            "Length of matched words": matched_string_char_length,
+            "Matched consecutive sequence": matched_string,
+            "% target extracted": "N/A"
+            if clean_target_len == 0
+            else 100 * matched_string_char_length / clean_target_len,
+            analysis_input.prompt_key: prompt,
+            analysis_input.target_key: target,
+            analysis_input.generation_key: prediction,
+        }
+
+        return word_level_lcs_result_dict
+
+    def word_level_lcs_result_formatted(self) -> pd.DataFrame:
+        """Returns an interpretable dataframe of the word-level results."""
+        if self.word_level_longest_common_subsequence is None:
+            raise ValueError("No lcs results to display.")
+        if self.analysis_input is None:
+            raise ValueError("No analysis input, can't id keys for formatting")
+
+        word_level_longest_common_subsequence_list = list(
+            self.word_level_longest_common_subsequence
+        )
+
+        displays: List[Dict[str, Any]] = []
+
+        for word_level_tuple, augmented_row in zip(
+            word_level_longest_common_subsequence_list,
+            self.augmented_output_dataset.T.to_dict().values(),
+        ):
+            num_matched_words = word_level_tuple[0]
+            matched_string = word_level_tuple[1]
+            displays.append(
+                self.format_single_word_level_lcs_result(
+                    num_matched_words=num_matched_words,
+                    matched_string=matched_string,
+                    augmented_row=augmented_row,
+                    analysis_input=self.analysis_input,  # pyre-ignore
+                )
+            )
+
+        return pd.DataFrame(displays)
+
     def format_single_lcs_result(
         self,
         lcs_dict: Dict[str, Any],
@@ -154,7 +213,7 @@ def _clean_text_remove_consecutive_whitespace(text: str) -> str:
 
 def _word_level_longest_common_subsequence_helper(
     s1: str, s2: str, autojunk: bool = True
-) -> int:
+) -> Tuple[int, str]:
     """
     Implementation of the longest common subsequence at word level.
 
@@ -171,10 +230,13 @@ def _word_level_longest_common_subsequence_helper(
 
     # Initialize the length of matched words count
     matched_words_count = 0
+    matched_words = []
     for block in matching_blocks:
         if block.size > 0:
             matched_words_count += block.size
-    return matched_words_count
+            matched_words.extend(s1_list[block.a : block.a + block.size])
+    reconstructed_match = " ".join(matched_words)
+    return (matched_words_count, reconstructed_match)
 
 
 def _char_level_longest_common_subsequence_helper(
@@ -324,7 +386,7 @@ def __init__(self, analysis_input: TextInclusionAnalysisInput) -> None:
 
     def _compute_word_level_longest_common_subsequence_helper(
         self, row: pd.Series, s1_column: str | None = None, s2_column: str | None = None
-    ) -> int:
+    ) -> Tuple[int, str]:
         """Compute char level longest common subsequence between target and generation text.
         Text are cleaned first.
 
diff --git a/privacy_guard/analysis/tests/test_text_inclusion.py b/privacy_guard/analysis/tests/test_text_inclusion.py
@@ -262,7 +262,7 @@ def test_text_inclusion_with_char_level_longest_common_subsequence(self) -> None
             results["char_level_longest_common_subsequence"],
             results["word_level_longest_common_subsequence"],
         ):
-            self.assertGreaterEqual(char_lcs, word_lcs)
+            self.assertGreaterEqual(char_lcs, word_lcs[0])
 
     def test_text_inclusion_augmented_output(self) -> None:
         analysis_input = TextInclusionAnalysisInput(
@@ -463,16 +463,26 @@ def test_word_level_longest_common_susequence_match(self) -> None:
             + ("t" * 130)
         )
 
-        self.assertEqual(_word_level_longest_common_subsequence_helper(s1=s1, s2=s2), 2)
-        self.assertEqual(_word_level_longest_common_subsequence_helper(s1=s1, s2=s1), 5)
+        self.assertEqual(
+            _word_level_longest_common_subsequence_helper(s1=s1, s2=s2)[0], 2
+        )
+        self.assertEqual(
+            _word_level_longest_common_subsequence_helper(s1=s1, s2=s1)[0], 5
+        )
 
         s1 = "a b a"
         s2 = "c a b a d"
         s3 = "a d b a"
 
-        self.assertEqual(_word_level_longest_common_subsequence_helper(s1=s1, s2=s2), 3)
-        self.assertEqual(_word_level_longest_common_subsequence_helper(s1=s2, s2=s3), 3)
-        self.assertEqual(_word_level_longest_common_subsequence_helper(s1=s1, s2=s3), 3)
+        self.assertEqual(
+            _word_level_longest_common_subsequence_helper(s1=s1, s2=s2), (3, "a b a")
+        )
+        self.assertEqual(
+            _word_level_longest_common_subsequence_helper(s1=s2, s2=s3), (3, "a b a")
+        )
+        self.assertEqual(
+            _word_level_longest_common_subsequence_helper(s1=s1, s2=s3), (3, "a b a")
+        )
 
     def test_char_level_longest_common_susequence_match(self) -> None:
         s1 = ("w" * 5) + ("t" * 16) + ("b" * 5) + ("t" * 15)
@@ -517,11 +527,15 @@ def test_longest_common_susequence_match_autojunk(self) -> None:
         s2 = ("x " * 50) + ("t " * 160) + ("c " * 150) + ("t " * 200) + "end2"
 
         self.assertEqual(
-            _word_level_longest_common_subsequence_helper(s1=s1, s2=s2, autojunk=False),
+            _word_level_longest_common_subsequence_helper(s1=s1, s2=s2, autojunk=False)[
+                0
+            ],
             260,
         )
         self.assertEqual(
-            _word_level_longest_common_subsequence_helper(s1=s1, s2=s2, autojunk=True),
+            _word_level_longest_common_subsequence_helper(s1=s1, s2=s2, autojunk=True)[
+                0
+            ],
             0,
         )
 
@@ -608,3 +622,236 @@ def test_analysis_with_remove_consecutive_whitespace(self) -> None:
             results_basic["edit_similarity_score"].iloc[0],
             results_cleaned["edit_similarity_score"].iloc[0],
         )
+
+    def test_format_single_word_level_lcs_result(self) -> None:
+        """Test format_single_word_level_lcs_result returns correct dictionary structure."""
+        analysis_outputs = self.analysis_node.run_analysis()
+        self.assertIsInstance(analysis_outputs, TextInclusionAnalysisNodeOutput)
+
+        # Get the augmented row data
+        augmented_row = analysis_outputs.augmented_output_dataset.iloc[-1].to_dict()
+
+        # Call format_single_word_level_lcs_result directly
+        result = analysis_outputs.format_single_word_level_lcs_result(
+            num_matched_words=3,
+            matched_string="dolorem ipsum quia",
+            augmented_row=augmented_row,
+            analysis_input=self.analysis_input,
+        )
+
+        # Verify the result dictionary has the expected keys
+        self.assertIn("Count of matched words", result.keys())
+        self.assertIn("Length of matched words", result.keys())
+        self.assertIn("Matched consecutive sequence", result.keys())
+        self.assertIn("% target extracted", result.keys())
+        self.assertIn("prompt", result.keys())
+        self.assertIn("output_text", result.keys())
+        self.assertIn("target", result.keys())
+
+        # Verify the values are correct
+        self.assertEqual(result["Count of matched words"], 3)
+        self.assertEqual(result["Length of matched words"], len("dolorem ipsum quia"))
+        self.assertEqual(result["Matched consecutive sequence"], "dolorem ipsum quia")
+
+    def test_format_single_word_level_lcs_result_empty_target(self) -> None:
+        """Test format_single_word_level_lcs_result handles empty target correctly."""
+        analysis_outputs = self.analysis_node.run_analysis()
+
+        # Create an augmented row with an empty target
+        augmented_row = {
+            "prompt": "test prompt",
+            "target": "",
+            "output_text": "test output",
+        }
+
+        result = analysis_outputs.format_single_word_level_lcs_result(
+            num_matched_words=0,
+            matched_string="",
+            augmented_row=augmented_row,
+            analysis_input=self.analysis_input,
+        )
+
+        # Verify % target extracted is N/A for empty target
+        self.assertEqual(result["% target extracted"], "N/A")
+
+    def test_word_level_lcs_result_formatted(self) -> None:
+        """Test word_level_lcs_result_formatted returns correct DataFrame."""
+        analysis_outputs = self.analysis_node.run_analysis()
+        self.assertIsInstance(analysis_outputs, TextInclusionAnalysisNodeOutput)
+
+        # Ensure word-level LCS is computed
+        self.assertIsNotNone(analysis_outputs.word_level_longest_common_subsequence)
+
+        # Call word_level_lcs_result_formatted
+        word_level_formatted = analysis_outputs.word_level_lcs_result_formatted()
+
+        # Verify it returns a DataFrame
+        self.assertIsInstance(word_level_formatted, pd.DataFrame)
+
+        # Verify the DataFrame has the expected columns
+        self.assertIn("Count of matched words", word_level_formatted.columns)
+        self.assertIn("Length of matched words", word_level_formatted.columns)
+        self.assertIn("Matched consecutive sequence", word_level_formatted.columns)
+        self.assertIn("% target extracted", word_level_formatted.columns)
+        self.assertIn("prompt", word_level_formatted.columns)
+        self.assertIn("target", word_level_formatted.columns)
+        self.assertIn("output_text", word_level_formatted.columns)
+
+        # Verify the DataFrame has the same number of rows as the input data
+        self.assertEqual(len(word_level_formatted), len(self.data["prompt"]))
+
+    def test_word_level_lcs_result_formatted_no_lcs_results(self) -> None:
+        """Test word_level_lcs_result_formatted raises error when no LCS results."""
+        outputs = TextInclusionAnalysisNodeOutput(
+            num_samples=0,
+            exact_match=pd.Series(),
+            inclusion_score=pd.Series(),
+            longest_common_substring=None,
+            longest_common_substring_false_pos=None,
+            decision_targets_lcs=None,
+            decision_targets_lcs_len=None,
+            edit_similarity=None,
+            edit_similarity_score=None,
+            filtered_true_positive_list=None,
+            augmented_output_dataset=pd.DataFrame(),
+            word_level_longest_common_subsequence=None,
+            char_level_longest_common_subsequence=None,
+            analysis_input=None,
+        )
+        with self.assertRaisesRegex(ValueError, "No lcs results to display"):
+            outputs.word_level_lcs_result_formatted()
+
+    def test_word_level_lcs_result_formatted_no_analysis_input(self) -> None:
+        """Test word_level_lcs_result_formatted raises error when no analysis input."""
+        outputs = TextInclusionAnalysisNodeOutput(
+            num_samples=0,
+            exact_match=pd.Series(),
+            inclusion_score=pd.Series(),
+            longest_common_substring=None,
+            longest_common_substring_false_pos=None,
+            decision_targets_lcs=None,
+            decision_targets_lcs_len=None,
+            edit_similarity=None,
+            edit_similarity_score=None,
+            filtered_true_positive_list=None,
+            augmented_output_dataset=pd.DataFrame(),
+            word_level_longest_common_subsequence=pd.Series([(1, "test")]),
+            char_level_longest_common_subsequence=None,
+            analysis_input=None,
+        )
+        with self.assertRaisesRegex(ValueError, "No analysis input"):
+            outputs.word_level_lcs_result_formatted()
+
+    def test_word_level_lcs_result_formatted_german(self) -> None:
+        """Test word_level_lcs_result_formatted with German text containing non-consecutive matches."""
+        # Target and output differ only in filler words (HIER vs DORT)
+        # This tests that non-consecutive matching works correctly
+        german_data = {
+            "prompt": [
+                "Erzähle mir eine Geschichte über einen Hund im Wald",
+            ],
+            "target": [
+                "Der kleine Hund läuft HIER durch den großen Wald HIER und findet HIER einen roten Ball HIER unter dem alten Baum HIER neben dem kleinen Bach",
+            ],
+            "output_text": [
+                "Der kleine Hund läuft DORT durch den großen Wald DORT und findet DORT einen roten Ball DORT unter dem alten Baum DORT neben dem kleinen Bach",
+            ],
+        }
+
+        german_analysis_input = TextInclusionAnalysisInput(
+            generation_df=pd.DataFrame(german_data)
+        )
+        german_analysis_node = TextInclusionAnalysisNode(
+            analysis_input=german_analysis_input
+        )
+
+        analysis_outputs = german_analysis_node.run_analysis()
+
+        # Ensure word-level LCS is computed
+        self.assertIsNotNone(analysis_outputs.word_level_longest_common_subsequence)
+
+        # Call word_level_lcs_result_formatted
+        word_level_formatted = analysis_outputs.word_level_lcs_result_formatted()
+
+        # Verify it returns a DataFrame with correct structure
+        self.assertIsInstance(word_level_formatted, pd.DataFrame)
+        self.assertEqual(len(word_level_formatted), 1)
+
+        first_row = word_level_formatted.iloc[0]
+
+        # Target has 26 words, 5 are "HIER" which don't match "DORT" in output
+        # So we expect 21 matched words across multiple non-consecutive blocks:
+        # Block 1: "der kleine hund läuft" (4 words)
+        # Block 2: "durch den großen wald" (4 words)
+        # Block 3: "und findet" (2 words)
+        # Block 4: "einen roten ball" (3 words)
+        # Block 5: "unter dem alten baum" (4 words)
+        # Block 6: "neben dem kleinen bach" (4 words)
+        # Total: 4 + 4 + 2 + 3 + 4 + 4 = 21 words
+        self.assertEqual(first_row["Count of matched words"], 21)
+
+        # The matched string should be all words except HIER (after cleaning: lowercase, no punctuation)
+        expected_matched_string = (
+            "der kleine hund läuft durch den großen wald und findet "
+            "einen roten ball unter dem alten baum neben dem kleinen bach"
+        )
+        self.assertEqual(
+            first_row["Matched consecutive sequence"], expected_matched_string
+        )
+
+    def test_word_level_lcs_result_formatted_spanish(self) -> None:
+        """Test word_level_lcs_result_formatted with Spanish text containing non-consecutive matches."""
+        # Target and output differ only in filler words (AQUI vs ALLI)
+        # This tests that non-consecutive matching works correctly
+        spanish_data = {
+            "prompt": [
+                "Cuéntame una historia sobre un perro en el bosque",
+            ],
+            "target": [
+                "El pequeño perro corre AQUI por el gran bosque AQUI y encuentra AQUI una pelota roja AQUI bajo el viejo árbol AQUI junto al pequeño río",
+            ],
+            "output_text": [
+                "El pequeño perro corre ALLI por el gran bosque ALLI y encuentra ALLI una pelota roja ALLI bajo el viejo árbol ALLI junto al pequeño río",
+            ],
+        }
+
+        spanish_analysis_input = TextInclusionAnalysisInput(
+            generation_df=pd.DataFrame(spanish_data)
+        )
+        spanish_analysis_node = TextInclusionAnalysisNode(
+            analysis_input=spanish_analysis_input
+        )
+
+        analysis_outputs = spanish_analysis_node.run_analysis()
+
+        # Ensure word-level LCS is computed
+        self.assertIsNotNone(analysis_outputs.word_level_longest_common_subsequence)
+
+        # Call word_level_lcs_result_formatted
+        word_level_formatted = analysis_outputs.word_level_lcs_result_formatted()
+
+        # Verify it returns a DataFrame with correct structure
+        self.assertIsInstance(word_level_formatted, pd.DataFrame)
+        self.assertEqual(len(word_level_formatted), 1)
+
+        first_row = word_level_formatted.iloc[0]
+
+        # Target has 26 words, 5 are "AQUI" which don't match "ALLI" in output
+        # So we expect 21 matched words across multiple non-consecutive blocks:
+        # Block 1: "el pequeño perro corre" (4 words)
+        # Block 2: "por el gran bosque" (4 words)
+        # Block 3: "y encuentra" (2 words)
+        # Block 4: "una pelota roja" (3 words)
+        # Block 5: "bajo el viejo árbol" (4 words)
+        # Block 6: "junto al pequeño río" (4 words)
+        # Total: 4 + 4 + 2 + 3 + 4 + 4 = 21 words
+        self.assertEqual(first_row["Count of matched words"], 21)
+
+        # The matched string should be all words except AQUI (after cleaning: lowercase, no punctuation)
+        expected_matched_string = (
+            "el pequeño perro corre por el gran bosque y encuentra "
+            "una pelota roja bajo el viejo árbol junto al pequeño río"
+        )
+        self.assertEqual(
+            first_row["Matched consecutive sequence"], expected_matched_string
+        )