Skip to content

Commit 9aa2af5

Browse files
Merge pull request #688 from pitangainnovare/reduz-escopo-de-busca-openalex
Reduz escopo de busca openalex em dedup com scielo
2 parents c104418 + fc29b16 commit 9aa2af5

5 files changed

Lines changed: 151 additions & 34 deletions

File tree

etl/deduplicator/openalex.py

Lines changed: 68 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@ def find_matches(
4747

4848
primary = select_primary_scielo_doc(scielo_group)
4949
rules = self.rules
50+
if not self._can_search_openalex(primary):
51+
logger.debug(
52+
"Skipping OpenAlex match lookup for SciELO doc outside configured query scope"
53+
)
54+
return []
55+
5056
matches = []
5157

5258
for strategy in rules["openalex_match_strategies"]:
@@ -119,13 +125,71 @@ def _deduplicate_openalex_matches(self, matches: list) -> list:
119125
deduped.append(match)
120126
return deduped
121127

128+
def _can_search_openalex(self, scielo_doc: dict) -> bool:
129+
year = self._publication_year(scielo_doc)
130+
if year is None:
131+
return False
132+
133+
query_rules = self.rules.get("openalex_query") or {}
134+
min_year = query_rules.get("publication_year_min")
135+
max_year = query_rules.get("publication_year_max")
136+
if min_year is None and max_year is None:
137+
return True
138+
139+
tolerance = self._year_tolerance()
140+
lower = year - tolerance
141+
upper = year + tolerance
142+
143+
try:
144+
if min_year is not None and upper < int(min_year):
145+
return False
146+
if max_year is not None and lower > int(max_year):
147+
return False
148+
except (TypeError, ValueError):
149+
logger.warning("Invalid OpenAlex publication year query bounds: %s", query_rules)
150+
return True
151+
152+
return True
153+
154+
def _publication_year(self, doc: dict) -> int | None:
155+
try:
156+
return int(doc.get("publication_year"))
157+
except (TypeError, ValueError):
158+
return None
159+
160+
def _year_tolerance(self) -> int:
161+
validation_rules = self.rules.get("openalex_validation") or {}
162+
try:
163+
return int(validation_rules.get("year_tolerance", 0) or 0)
164+
except (TypeError, ValueError):
165+
return 0
166+
167+
def _apply_openalex_query_constraints(
168+
self,
169+
query: dict[str, Any],
170+
scielo_doc: dict[str, Any],
171+
) -> dict[str, Any]:
172+
bool_query = query.setdefault("bool", {})
173+
query_rules = self.rules.get("openalex_query") or {}
174+
175+
if query_rules.get("exclude_is_xpac"):
176+
bool_query.setdefault("must_not", []).append({"term": {"is_xpac": True}})
177+
178+
year = self._publication_year(scielo_doc)
179+
if year is not None:
180+
tolerance = self._year_tolerance()
181+
bool_query.setdefault("filter", []).append(
182+
{"range": {"publication_year": {"gte": year - tolerance, "lte": year + tolerance}}}
183+
)
184+
185+
return query
186+
122187
def _search_openalex_by_doi(
123188
self,
124189
doi: str,
125190
scielo_doc: Dict[str, Any],
126191
size: int = 10,
127192
) -> List[Dict[str, Any]]:
128-
year = scielo_doc.get("publication_year")
129193
normalized_doi = normalize_doi(doi)
130194
if not normalized_doi:
131195
logger.warning("Invalid DOI after normalization: %s", doi)
@@ -143,14 +207,7 @@ def _search_openalex_by_doi(
143207
]
144208
}
145209
}
146-
if year is not None:
147-
try:
148-
year = int(year)
149-
query["bool"]["filter"].append(
150-
{"range": {"publication_year": {"gte": year - 1, "lte": year + 1}}}
151-
)
152-
except (ValueError, TypeError):
153-
pass
210+
self._apply_openalex_query_constraints(query, scielo_doc)
154211

155212
try:
156213
response = self.client.client.search(
@@ -185,7 +242,6 @@ def _search_openalex_by_isbn(
185242
scielo_doc: Dict[str, Any],
186243
size: int = 10,
187244
) -> List[Dict[str, Any]]:
188-
year = scielo_doc.get("publication_year")
189245
query = {
190246
"bool": {
191247
"should": [
@@ -201,15 +257,7 @@ def _search_openalex_by_isbn(
201257
"minimum_should_match": 1,
202258
}
203259
}
204-
205-
if year is not None:
206-
try:
207-
year = int(year)
208-
query["bool"]["filter"] = [
209-
{"range": {"publication_year": {"gte": year - 1, "lte": year + 1}}}
210-
]
211-
except (ValueError, TypeError):
212-
pass
260+
self._apply_openalex_query_constraints(query, scielo_doc)
213261

214262
try:
215263
response = self.client.client.search(
@@ -227,7 +275,6 @@ def _search_openalex_by_title_year(
227275
size: int = 10,
228276
) -> List[Dict[str, Any]]:
229277
title = scielo_doc.get("title", "")
230-
year = scielo_doc.get("publication_year")
231278
issns = scielo_doc.get("source_issns", [])
232279
if not title:
233280
return []
@@ -247,16 +294,7 @@ def _search_openalex_by_title_year(
247294
]
248295
}
249296
}
250-
251-
if year:
252-
try:
253-
year_int = int(year)
254-
except (TypeError, ValueError):
255-
logger.warning("Invalid year value: %s, skipping range filter", year)
256-
return []
257-
query["bool"]["filter"] = [
258-
{"range": {"publication_year": {"gte": year_int - 1, "lte": year_int + 1}}}
259-
]
297+
self._apply_openalex_query_constraints(query, scielo_doc)
260298

261299
if issns:
262300
query["bool"]["should"] = [{"terms": {"source.issns": issns}}]

etl/fixtures/pipeline_configs.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@
1313
"rules": {
1414
"scielo_dedup_strategies": ["doi", "pid", "fuzzy"],
1515
"openalex_match_strategies": ["doi", "isbn", "title"],
16+
"openalex_query": {
17+
"exclude_is_xpac": true,
18+
"publication_year_min": 2019,
19+
"publication_year_max": 2025
20+
},
1621
"doi_requires_title_overlap": true,
1722
"pid_requires_year_match": true,
1823
"pid_requires_source_match": true,

etl/models.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,12 @@
5454
"isbn_title_threshold": 0.80,
5555
}
5656

57+
# Baseline values for the "openalex_query" rule group: no is_xpac exclusion
# and no publication-year bounds.
DEFAULT_OPENALEX_QUERY_RULES = dict(
    exclude_is_xpac=False,
    publication_year_min=None,
    publication_year_max=None,
)
62+
5763

5864
class EtlStatus(models.TextChoices):
5965
PENDING = "pending", "Pending"
@@ -240,6 +246,7 @@ def to_rules(self) -> dict:
240246
self.clean()
241247
rules = self.rules or {}
242248
oa_val = rules.get("openalex_validation", {})
249+
oa_query = rules.get("openalex_query", {})
243250
return {
244251
"document_type": normalize_document_type_for_etl(self.default_document_type),
245252
"scielo_dedup_strategies": list(rules.get("scielo_dedup_strategies", [])),
@@ -255,6 +262,10 @@ def to_rules(self) -> dict:
255262
**DEFAULT_OPENALEX_VALIDATION_RULES,
256263
**oa_val,
257264
},
265+
"openalex_query": {
266+
**DEFAULT_OPENALEX_QUERY_RULES,
267+
**oa_query,
268+
},
258269
}
259270

260271

etl/tests/pipeline/test_pipeline_dedup.py

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
from unittest.mock import Mock
22

3-
from django.test import TestCase
4-
53
from etl.deduplicator.helpers import can_compare, rules_for_pair
64
from etl.deduplicator.openalex import OpenAlexMatcher
75
from etl.deduplicator.scielo import SciELODeduplicator
86
from etl.models import EtlPipelineConfig
97
from etl.pipeline import OpenSearchETLPipeline
8+
from etl.tests.base import EtlTestCase
109

1110

1211
def make_deduplicator(document_type):
@@ -19,7 +18,7 @@ def make_matcher(document_type):
1918
return matcher
2019

2120

22-
class DocumentRulesTests(TestCase):
21+
class DocumentRulesTests(EtlTestCase):
2322
def test_can_compare_rejects_doc_type_outside_pipeline_config_rules(self):
2423
rules = EtlPipelineConfig.objects.get_enabled_by_name("book").to_rules()
2524

@@ -229,6 +228,14 @@ def test_openalex_doi_search_uses_exact_or_prefix_queries(self):
229228
self.assertNotIn("wildcard", str(body))
230229
doi_filter = body["query"]["bool"]["filter"][0]["bool"]
231230
self.assertEqual(doi_filter["minimum_should_match"], 1)
231+
self.assertEqual(
232+
body["query"]["bool"]["must_not"],
233+
[{"term": {"is_xpac": True}}],
234+
)
235+
self.assertIn(
236+
{"range": {"publication_year": {"gte": 2024, "lte": 2026}}},
237+
body["query"]["bool"]["filter"],
238+
)
232239
self.assertIn(
233240
{"prefix": {"doi.keyword": "https://doi.org/10.1590/0034-7167.202578supl101"}},
234241
doi_filter["should"],
@@ -238,6 +245,62 @@ def test_openalex_doi_search_uses_exact_or_prefix_queries(self):
238245
doi_filter["should"],
239246
)
240247

248+
def test_openalex_match_skips_year_outside_configured_raw_range(self):
    # A 2017 document must yield no matches and must not trigger any search.
    scielo_doc = {
        "type": "article",
        "publication_year": 2017,
        "ids": {"doi": "10.1590/0034-7167.202578SUPL101"},
    }
    matcher = make_matcher("article")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()

    found = matcher.find_matches([scielo_doc], max_candidates=3)

    self.assertEqual(found, [])
    matcher.client.client.search.assert_not_called()
266+
267+
def test_openalex_match_keeps_year_adjacent_to_configured_raw_range(self):
    # A 2018 document must still trigger exactly one search call.
    scielo_doc = {
        "type": "article",
        "publication_year": 2018,
        "ids": {"doi": "10.1590/0034-7167.202578SUPL101"},
    }
    matcher = make_matcher("article")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()
    empty_response = {"hits": {"hits": []}}
    matcher.client.client.search.return_value = empty_response

    matcher.find_matches([scielo_doc], max_candidates=3)

    matcher.client.client.search.assert_called_once()
285+
286+
def test_openalex_match_skips_missing_scielo_publication_year(self):
    # A document with no publication_year must yield no matches and no search.
    scielo_doc = {
        "type": "article",
        "ids": {"doi": "10.1590/0034-7167.202578SUPL101"},
    }
    matcher = make_matcher("article")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()

    found = matcher.find_matches([scielo_doc], max_candidates=3)

    self.assertEqual(found, [])
    matcher.client.client.search.assert_not_called()
303+
241304
def _openalex_article(self, openalex_id, language, doi_suffix):
242305
titles = {
243306
"en": "Ethical dilemmas in nursing professionals' work",

search_gateway/tests/test_transforms.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
class ScopeDisplayTransformTests(SimpleTestCase):
77
def test_openalex_works_label(self):
8-
self.assertEqual(apply_display_transform("scope", "openalex_works"), "OpenAlex")
8+
self.assertEqual(apply_display_transform("scope", "openalex"), "OpenAlex")
99

1010
def test_scielo_label(self):
1111
self.assertEqual(apply_display_transform("scope", "scielo"), "SciELO")

0 commit comments

Comments
 (0)