Skip to content

Commit 7eef545

Browse files
fix: classify retraction nature for article flags [AI-assisted] (#1028)
Retraction Watch rows were previously cached as is_retracted=true regardless of RetractionNature. This caused corrections and expressions of concern to be reported as retracted articles.

Add explicit RetractionNature classification and regression tests so only true retraction-like records are flagged as retracted while preserving expected behavior for real retractions.

Co-authored-by: florath-ai-assistant[bot] <Andreas.Florath@telekom.de>
1 parent 0c2d3ef commit 7eef545

File tree

2 files changed

+84
-1
lines changed

2 files changed

+84
-1
lines changed

src/aletheia_probe/updater/sources/retraction_watch.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,14 @@
2727
ARTICLE_BATCH_SIZE = 1000
2828
PROGRESS_LOG_INTERVAL = 5000
2929
CACHE_EXPIRY_HOURS = 24 * 365
30+
NON_RETRACTION_NATURE_KEYWORDS = (
31+
"correction",
32+
"erratum",
33+
"corrigendum",
34+
"expression of concern",
35+
"reinstatement",
36+
)
37+
RETRACTION_NATURE_KEYWORDS = ("retract", "withdraw")
3038

3139

3240
class RetractionWatchSource(DataSource):
@@ -549,11 +557,12 @@ def _collect_article_retractions(self, article_batch: list[dict[str, str]]) -> N
549557
retraction_nature = article.get("retraction_nature", "")
550558
reason = article.get("reason", "")
551559
retraction_doi = article.get("retraction_doi", "")
560+
is_retracted = self._is_retracted_nature(retraction_nature)
552561

553562
self.article_retractions.append(
554563
{
555564
"doi": doi.lower().strip(),
556-
"is_retracted": True,
565+
"is_retracted": is_retracted,
557566
"retraction_type": retraction_nature or "Retraction",
558567
"retraction_date": retraction_date_formatted,
559568
"retraction_doi": retraction_doi if retraction_doi else None,
@@ -562,3 +571,15 @@ def _collect_article_retractions(self, article_batch: list[dict[str, str]]) -> N
562571
"expires_at": expires_at.isoformat(),
563572
}
564573
)
574+
575+
def _is_retracted_nature(self, retraction_nature: str) -> bool:
576+
"""Determine whether a RetractionNature value indicates true retraction."""
577+
normalized = retraction_nature.lower().strip()
578+
if not normalized:
579+
# Keep historical behavior for blank types to avoid dropping valid retractions.
580+
return True
581+
582+
if any(keyword in normalized for keyword in NON_RETRACTION_NATURE_KEYWORDS):
583+
return False
584+
585+
return any(keyword in normalized for keyword in RETRACTION_NATURE_KEYWORDS)

tests/unit/updater/test_retraction_watch_source.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,3 +92,65 @@ async def test_fetch_data_missing_csv(self, source):
9292
with patch.object(source, "_clone_repository", return_value=repo_path):
9393
result = await source.fetch_data()
9494
assert result == []
95+
96+
@pytest.mark.parametrize(
97+
("retraction_nature", "expected"),
98+
[
99+
("Retraction", True),
100+
("Partial Retraction", True),
101+
("Withdrawal", True),
102+
("Correction", False),
103+
("Erratum", False),
104+
("Corrigendum", False),
105+
("Expression of concern", False),
106+
("Reinstatement", False),
107+
("", True),
108+
],
109+
)
110+
def test_is_retracted_nature(self, source, retraction_nature, expected):
111+
"""Test RetractionNature classification for article-level retraction flag."""
112+
assert source._is_retracted_nature(retraction_nature) is expected
113+
114+
def test_collect_article_retractions_marks_correction_not_retracted(self, source):
115+
"""Test correction records are cached as non-retracted article updates."""
116+
source.article_retractions = []
117+
118+
source._collect_article_retractions(
119+
[
120+
{
121+
"doi": "10.1234/test-correction",
122+
"retraction_date_str": "01/15/2025 00:00",
123+
"retraction_nature": "Correction",
124+
"reason": "Error in figure",
125+
"retraction_doi": "10.1234/correction-notice",
126+
}
127+
]
128+
)
129+
130+
assert len(source.article_retractions) == 1
131+
article = source.article_retractions[0]
132+
assert article["doi"] == "10.1234/test-correction"
133+
assert article["is_retracted"] is False
134+
assert article["retraction_type"] == "Correction"
135+
136+
def test_collect_article_retractions_marks_retraction_as_retracted(self, source):
137+
"""Test retraction records remain cached as retracted articles."""
138+
source.article_retractions = []
139+
140+
source._collect_article_retractions(
141+
[
142+
{
143+
"doi": "10.1234/test-retraction",
144+
"retraction_date_str": "01/15/2025 00:00",
145+
"retraction_nature": "Retraction",
146+
"reason": "Misconduct",
147+
"retraction_doi": "10.1234/retraction-notice",
148+
}
149+
]
150+
)
151+
152+
assert len(source.article_retractions) == 1
153+
article = source.article_retractions[0]
154+
assert article["doi"] == "10.1234/test-retraction"
155+
assert article["is_retracted"] is True
156+
assert article["retraction_type"] == "Retraction"

0 commit comments

Comments
 (0)