Skip to content

Commit 6899139

Browse files
mgrange1998 authored and facebook-github-bot committed
rename "disable_lcs" to "disable_longest_common_substring" and include longest_common_subsequence metrics to TextInclusionNode (#66)
Summary: Pull Request resolved: #66
- Rename "disable_lcs" to "disable_longest_common_substring".
- Add _compute_char_level_longest_common_subsequence_helper and _compute_word_level_longest_common_subsequence_helper to TextInclusionAnalysisNode, and the corresponding disable options to its input.
Reviewed By: s-huu
Differential Revision: D83155383
fbshipit-source-id: 9ad3a68aa21038947480693fc6dc2590a9ceddf2
1 parent 77d9948 commit 6899139

3 files changed

Lines changed: 104 additions & 4 deletions

File tree

privacy_guard/analysis/extraction/text_inclusion_analysis_input.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,11 @@ def __init__(
4040
target_key: str = "target",
4141
generation_key: str = "output_text",
4242
disable_exact_match: bool = False,
43-
disable_lcs: bool = False,
43+
disable_longest_common_substring: bool = False,
4444
disable_similarity: bool = False,
4545
lcs_bound_config: LCSBoundConfig | None = None,
46+
disable_word_level_longest_common_subsequence: bool = False,
47+
disable_char_level_longest_common_subsequence: bool = True,
4648
) -> None:
4749
columns = generation_df.columns.tolist()
4850
assert (
@@ -60,10 +62,17 @@ def __init__(
6062
self.generation_key = generation_key
6163

6264
self.disable_exact_match = disable_exact_match
63-
self.disable_lcs = disable_lcs
65+
self.disable_longest_common_substring = disable_longest_common_substring
6466
self.disable_similarity = disable_similarity
6567
self.lcs_bound_config = lcs_bound_config
6668

69+
self.disable_word_level_longest_common_subsequence = (
70+
disable_word_level_longest_common_subsequence
71+
)
72+
self.disable_char_level_longest_common_subsequence = (
73+
disable_char_level_longest_common_subsequence
74+
)
75+
6776
super().__init__(df_train_user=generation_df, df_test_user=pd.DataFrame())
6877

6978
@property

privacy_guard/analysis/extraction/text_inclusion_analysis_node.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ class TextInclusionAnalysisNodeOutput(BaseAnalysisOutput):
4141
edit_similarity_score: Optional[pd.Series]
4242
filtered_true_positive_list: list[str] | None
4343
augmented_output_dataset: pd.DataFrame
44+
char_level_longest_common_subsequence: Optional[pd.Series]
45+
word_level_longest_common_subsequence: Optional[pd.Series]
4446

4547

4648
def _clean_text(text: str) -> str:
@@ -219,6 +221,32 @@ def __init__(self, analysis_input: TextInclusionAnalysisInput) -> None:
219221

220222
super().__init__(analysis_input=analysis_input)
221223

224+
def _compute_word_level_longest_common_subsequence_helper(
225+
self, row: pd.Series, s1_column: str | None = None, s2_column: str | None = None
226+
) -> int:
227+
"""Compute word level longest common subsequence between target and generation text.
228+
Texts are cleaned first.
229+
230+
Returns:
231+
int: Number of shared words between the two strings.
232+
"""
233+
s1 = _clean_text(row[s1_column or self.target_key])
234+
s2 = _clean_text(row[s2_column or self.generation_key])
235+
return _word_level_longest_common_subsequence_helper(s1, s2)
236+
237+
def _compute_char_level_longest_common_subsequence_helper(
238+
self, row: pd.Series, s1_column: str | None = None, s2_column: str | None = None
239+
) -> int:
240+
"""Compute char level longest common subsequence between target and generation text.
241+
Texts are cleaned first.
242+
243+
Returns:
244+
int: Length of the char level longest common subsequence of the two strings.
245+
"""
246+
s1 = _clean_text(row[s1_column or self.target_key])
247+
s2 = _clean_text(row[s2_column or self.generation_key])
248+
return _char_level_longest_common_subsequence_helper(s1, s2)
249+
222250
def _compute_edit_similarity(
223251
self, row: pd.Series, s1_column: str | None = None, s2_column: str | None = None
224252
) -> int:
@@ -389,9 +417,11 @@ def run_analysis(self) -> TextInclusionAnalysisNodeOutput:
389417
edit_similarity_score=None,
390418
filtered_true_positive_list=None,
391419
augmented_output_dataset=generation_df,
420+
word_level_longest_common_subsequence=None,
421+
char_level_longest_common_subsequence=None,
392422
)
393423

394-
if not analysis_input.disable_lcs:
424+
if not analysis_input.disable_longest_common_substring:
395425
# Longest common substring
396426

397427
lcs_result = generation_df.progress_apply(
@@ -425,4 +455,26 @@ def run_analysis(self) -> TextInclusionAnalysisNodeOutput:
425455
outputs.edit_similarity = generation_df["edit_similarity"]
426456
outputs.edit_similarity_score = generation_df["edit_similarity_score"]
427457

458+
if not analysis_input.disable_word_level_longest_common_subsequence:
459+
generation_df["word_level_longest_common_subsequence"] = (
460+
generation_df.progress_apply(
461+
self._compute_word_level_longest_common_subsequence_helper, axis=1
462+
)
463+
)
464+
465+
outputs.word_level_longest_common_subsequence = generation_df[
466+
"word_level_longest_common_subsequence"
467+
]
468+
469+
if not analysis_input.disable_char_level_longest_common_subsequence:
470+
generation_df["char_level_longest_common_subsequence"] = (
471+
generation_df.progress_apply(
472+
self._compute_char_level_longest_common_subsequence_helper, axis=1
473+
)
474+
)
475+
476+
outputs.char_level_longest_common_subsequence = generation_df[
477+
"char_level_longest_common_subsequence"
478+
]
479+
428480
return outputs

privacy_guard/analysis/tests/test_text_inclusion.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ def test_output_types(self) -> None:
179179

180180
def test_text_inclusion_no_lcs(self) -> None:
181181
analysis_input = TextInclusionAnalysisInput(
182-
generation_df=pd.DataFrame(self.data), disable_lcs=True
182+
generation_df=pd.DataFrame(self.data), disable_longest_common_substring=True
183183
)
184184
analysis_node = TextInclusionAnalysisNode(analysis_input=analysis_input)
185185

@@ -199,6 +199,9 @@ def test_text_inclusion_no_lcs(self) -> None:
199199
self.assertIsNotNone(results["edit_similarity"], None)
200200
self.assertIsNotNone(results["edit_similarity_score"], None)
201201

202+
self.assertIsNone(results["char_level_longest_common_subsequence"])
203+
self.assertIsNotNone(results["word_level_longest_common_subsequence"])
204+
202205
def test_text_inclusion_no_similarity(self) -> None:
203206
analysis_input = TextInclusionAnalysisInput(
204207
generation_df=pd.DataFrame(self.data), disable_similarity=True
@@ -219,6 +222,40 @@ def test_text_inclusion_no_similarity(self) -> None:
219222
self.assertEqual(results["edit_similarity"], None)
220223
self.assertEqual(results["edit_similarity_score"], None)
221224

225+
self.assertIsNone(results["char_level_longest_common_subsequence"])
226+
self.assertIsNotNone(results["word_level_longest_common_subsequence"])
227+
228+
def test_text_inclusion_with_char_level_longest_common_subsequence(self) -> None:
229+
analysis_input = TextInclusionAnalysisInput(
230+
generation_df=pd.DataFrame(self.data),
231+
disable_char_level_longest_common_subsequence=False,
232+
disable_word_level_longest_common_subsequence=False,
233+
)
234+
analysis_node = TextInclusionAnalysisNode(analysis_input=analysis_input)
235+
236+
results = analysis_node.compute_outputs()
237+
238+
self.assertIn("exact_match", results)
239+
240+
self.assertIn("inclusion_score", results)
241+
242+
self.assertIn("longest_common_substring", results)
243+
self.assertIn("decision_targets_lcs", results)
244+
self.assertIsNotNone(results["longest_common_substring"])
245+
self.assertIsNotNone(results["decision_targets_lcs"])
246+
247+
self.assertIsNotNone(results["edit_similarity"])
248+
self.assertIsNotNone(results["edit_similarity_score"])
249+
250+
self.assertIsNotNone(results["char_level_longest_common_subsequence"])
251+
self.assertIsNotNone(results["word_level_longest_common_subsequence"])
252+
253+
for char_lcs, word_lcs in zip(
254+
results["char_level_longest_common_subsequence"],
255+
results["word_level_longest_common_subsequence"],
256+
):
257+
self.assertGreaterEqual(char_lcs, word_lcs)
258+
222259
def test_text_inclusion_augmented_output(self) -> None:
223260
analysis_input = TextInclusionAnalysisInput(
224261
generation_df=pd.DataFrame(self.data)
@@ -255,6 +292,8 @@ def test_multi_target(self) -> None:
255292
target_key="targets",
256293
disable_exact_match=True,
257294
disable_similarity=True,
295+
disable_word_level_longest_common_subsequence=True,
296+
disable_char_level_longest_common_subsequence=True,
258297
)
259298
multi_analysis_node = TextInclusionAnalysisNode(
260299
analysis_input=multi_analysis_input

0 commit comments

Comments
 (0)