Skip to content

Commit 7cc11a3

Browse files
Merge pull request #678 from scieloorg/ajust-match-performance
Ajusta desempenho de match por DOI e insere filtro collection em fixture
2 parents b60c9c5 + 9fba3f1 commit 7cc11a3

3 files changed

Lines changed: 71 additions & 1 deletion

File tree

etl/deduplicator/openalex.py

Lines changed: 29 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,18 @@ def _search_openalex_by_doi(
131131
logger.warning("Invalid DOI after normalization: %s", doi)
132132
return []
133133

134-
query = {"bool": {"filter": [{"wildcard": {"doi.keyword": f"*{normalized_doi}*"}}]}}
134+
query = {
135+
"bool": {
136+
"filter": [
137+
{
138+
"bool": {
139+
"should": self._doi_exact_or_prefix_queries(normalized_doi),
140+
"minimum_should_match": 1,
141+
}
142+
}
143+
]
144+
}
145+
}
135146
if year is not None:
136147
try:
137148
year = int(year)
@@ -151,6 +162,23 @@ def _search_openalex_by_doi(
151162
logger.error("Error searching OpenAlex by DOI: %s", exc)
152163
return []
153164

165+
def _doi_exact_or_prefix_queries(self, normalized_doi: str) -> list[dict[str, Any]]:
    """Build the ``should`` clauses that match a DOI exactly or by prefix.

    The DOI is tried bare and behind each ``doi.org`` / ``dx.doi.org``
    resolver URL (both ``http`` and ``https``), against the
    ``doi.keyword`` and ``ids.doi.keyword`` fields. For each field, all
    ``term`` clauses are emitted first, followed by all ``prefix`` clauses.

    Args:
        normalized_doi: DOI string as normalized by the caller.

    Returns:
        A list of ``{"term": ...}`` / ``{"prefix": ...}`` query dicts, or
        an empty list when ``normalized_doi`` is empty.
    """
    if not normalized_doi:
        # With an empty DOI the prefix clauses collapse to the bare resolver
        # URL (or an empty string), which would match far more than one
        # document. The visible caller appears to reject invalid DOIs before
        # reaching this helper; this guard keeps the helper safe on its own.
        return []

    # The empty prefix yields the bare DOI form.
    resolver_prefixes = (
        "",
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
    )
    doi_values = [f"{prefix}{normalized_doi}" for prefix in resolver_prefixes]
    fields = ("doi.keyword", "ids.doi.keyword")

    queries: list[dict[str, Any]] = []
    for field in fields:
        # Keep the original ordering: exact matches first, then prefixes.
        for clause in ("term", "prefix"):
            queries.extend({clause: {field: value}} for value in doi_values)

    return queries
181+
154182
def _search_openalex_by_isbn(
155183
self,
156184
isbns: List[str],

etl/tests/pipeline/test_pipeline_dedup.py

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -214,6 +214,30 @@ def test_openalex_doi_match_keeps_language_variants_with_same_normalized_doi(sel
214214
)
215215
self.assertTrue(all(match[1] == "doi" for match in matches))
216216

217+
def test_openalex_doi_search_uses_exact_or_prefix_queries(self):
    """The DOI search body carries term/prefix clauses and no wildcard."""
    matcher = make_matcher("article")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()
    # The mocked search returns no hits; only the request body is inspected.
    matcher.client.client.search.return_value = {"hits": {"hits": []}}

    matcher._search_openalex_by_doi(
        "10.1590/0034-7167.202578SUPL101",
        {"publication_year": 2025},
    )

    search_call = matcher.client.client.search.call_args
    request_body = search_call.kwargs["body"]
    self.assertNotIn("wildcard", str(request_body))

    doi_clause = request_body["query"]["bool"]["filter"][0]["bool"]
    self.assertEqual(doi_clause["minimum_should_match"], 1)

    # The expected clauses use the lower-cased form of the DOI passed above.
    expected_clauses = (
        {"prefix": {"doi.keyword": "https://doi.org/10.1590/0034-7167.202578supl101"}},
        {"term": {"doi.keyword": "10.1590/0034-7167.202578supl101"}},
    )
    for expected_clause in expected_clauses:
        self.assertIn(expected_clause, doi_clause["should"])
240+
217241
def _openalex_article(self, openalex_id, language, doi_suffix):
218242
titles = {
219243
"en": "Ethical dilemmas in nursing professionals' work",

search_gateway/fixtures/datasources.json

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -751,6 +751,21 @@
751751
"group_order": 0
752752
}
753753
},
754+
"collection": {
755+
"kind": "index",
756+
"index_field_name": "oca_data.scielo.collection",
757+
"filter": {
758+
"size": 100,
759+
"order": { "_key": "asc" }
760+
},
761+
"settings": {
762+
"label": "SciELO Collection",
763+
"widget": "select",
764+
"multiple_selection": true,
765+
"group": "coverage",
766+
"group_order": 0
767+
}
768+
},
754769

755770
"publication_year": {
756771
"kind": "index",
@@ -1248,6 +1263,7 @@
12481263
{ "value": "scope", "label": "Dataset Coverage" },
12491264
{ "value": "source_indexed_in", "label": "Indexed In (OpenAlex)" },
12501265
{ "value": "source_scielo_indexed_in", "label": "Indexed In (SciELO)" },
1266+
{ "value": "collection", "label": "SciELO Collection" },
12511267
{ "value": "document_type", "label": "Document Type" },
12521268
{ "value": "document_language", "label": "Document Language" },
12531269
{ "value": "open_access", "label": "Open Access" },
@@ -1281,6 +1297,7 @@
12811297
"scope",
12821298
"source_indexed_in",
12831299
"source_scielo_indexed_in",
1300+
"collection",
12841301

12851302
"publication_year",
12861303
"document_type",
@@ -1366,6 +1383,7 @@
13661383
"scope",
13671384
"source_indexed_in",
13681385
"source_scielo_indexed_in",
1386+
"collection",
13691387

13701388
"publication_year",
13711389
"document_type",

0 commit comments

Comments
 (0)