Skip to content

Commit c13c2a5

Browse files
Merge pull request #692 from pitangainnovare/silver-004
Silver 004 - Realiza otimizações no match de docs scielo (article) com openalex
2 parents 34849d2 + a58fd60 commit c13c2a5

6 files changed

Lines changed: 242 additions & 27 deletions

File tree

etl/deduplicator/openalex.py

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -97,7 +97,7 @@ def _try_openalex_by_isbn(self, primary: dict, max_candidates: int) -> list:
9797
return matches
9898

9999
def _try_openalex_by_title(self, primary: dict, max_candidates: int) -> list:
100-
if extract_doi(primary) or extract_isbns(primary):
100+
if extract_isbns(primary):
101101
return []
102102

103103
matches = []
@@ -251,8 +251,6 @@ def _search_openalex_by_isbn(
251251
{"terms": {"ids.eisbns.keyword": isbns}},
252252
{"terms": {"biblio.isbn.keyword": isbns}},
253253
{"terms": {"biblio.isbns.keyword": isbns}},
254-
{"terms": {"primary_location.source.issns.keyword": isbns}},
255-
{"terms": {"locations.source.issns.keyword": isbns}},
256254
],
257255
"minimum_should_match": 1,
258256
}
@@ -275,7 +273,7 @@ def _search_openalex_by_title_year(
275273
size: int = 10,
276274
) -> List[Dict[str, Any]]:
277275
title = scielo_doc.get("title", "")
278-
issns = scielo_doc.get("source_issns", [])
276+
issns = scielo_doc.get("source_issns") or []
279277
if not title:
280278
return []
281279

@@ -297,7 +295,7 @@ def _search_openalex_by_title_year(
297295
self._apply_openalex_query_constraints(query, scielo_doc)
298296

299297
if issns:
300-
query["bool"]["should"] = [{"terms": {"source.issns": issns}}]
298+
query["bool"]["should"] = self._source_issn_queries(issns)
301299
query["bool"]["minimum_should_match"] = 1
302300

303301
try:
@@ -310,6 +308,16 @@ def _search_openalex_by_title_year(
310308
logger.error("Error searching OpenAlex by title: %s", exc)
311309
return []
312310

311+
def _source_issn_queries(self, issns: list[str]) -> list[dict[str, Any]]:
312+
return [
313+
{"terms": {"source.issn.keyword": issns}},
314+
{"terms": {"source.issns.keyword": issns}},
315+
{"terms": {"primary_location.source.issn.keyword": issns}},
316+
{"terms": {"primary_location.source.issns.keyword": issns}},
317+
{"terms": {"locations.source.issn.keyword": issns}},
318+
{"terms": {"locations.source.issns.keyword": issns}},
319+
]
320+
313321
def _validate_openalex_match(
314322
self,
315323
scielo_doc: Dict[str, Any],

etl/fixtures/pipeline_configs.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
"openalex_index": "raw_openalex_works",
1313
"rules": {
1414
"scielo_dedup_strategies": ["doi", "pid", "fuzzy"],
15-
"openalex_match_strategies": ["doi", "isbn", "title"],
15+
"openalex_match_strategies": ["doi", "title"],
1616
"openalex_query": {
1717
"exclude_is_xpac": true,
1818
"publication_year_min": 2019,

etl/tests/domain/test_models.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
from datetime import timedelta
22

3+
from django.core.exceptions import ValidationError
34
from django.test import TestCase
45
from django.utils import timezone
56

@@ -134,3 +135,15 @@ def test_pipeline_config_to_rules_with_partial_rules_json(self):
134135
self.assertEqual(rules["fuzzy_min_similarity"], 0.90)
135136
self.assertEqual(rules["openalex_validation"]["min_score"], 70)
136137
self.assertEqual(rules["doi_requires_title_overlap"], True)
138+
139+
def test_pipeline_config_rejects_issn_openalex_strategy(self):
140+
config = EtlPipelineConfig(
141+
name="article",
142+
input_index="bronze_scielo_articles",
143+
input_document_kind="article",
144+
default_document_type="article",
145+
rules={"openalex_match_strategies": ["doi", "issn", "title"]},
146+
)
147+
148+
with self.assertRaises(ValidationError):
149+
config.to_rules()

etl/tests/pipeline/test_pipeline_dedup.py

Lines changed: 213 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,26 @@ def test_dataset_does_not_use_fuzzy_strategy(self):
6969

7070
self.assertEqual(len(groups), 2)
7171

72+
def test_document_type_openalex_match_strategies_are_domain_specific(self):
73+
self.assertEqual(
74+
EtlPipelineConfig.objects.get_enabled_by_name("article").to_rules()[
75+
"openalex_match_strategies"
76+
],
77+
["doi", "title"],
78+
)
79+
self.assertEqual(
80+
EtlPipelineConfig.objects.get_enabled_by_name("book").to_rules()[
81+
"openalex_match_strategies"
82+
],
83+
["doi", "isbn", "title"],
84+
)
85+
self.assertEqual(
86+
EtlPipelineConfig.objects.get_enabled_by_name("book-chapter").to_rules()[
87+
"openalex_match_strategies"
88+
],
89+
["doi", "isbn", "title"],
90+
)
91+
7292
def test_non_article_targets_build_unit_groups_without_deduplicator(self):
7393
pipeline = OpenSearchETLPipeline.__new__(OpenSearchETLPipeline)
7494
pipeline.pipeline_config = EtlPipelineConfig.objects.get_for_source("bronze_scielo_preprint")
@@ -245,6 +265,199 @@ def test_openalex_doi_search_uses_exact_or_prefix_queries(self):
245265
doi_filter["should"],
246266
)
247267

268+
def test_openalex_title_strategy_runs_when_doi_lookup_has_no_match(self):
269+
matcher = make_matcher("article")
270+
matcher.input_openalex_index = "raw_openalex_works"
271+
matcher.client = Mock()
272+
matcher.client.client.search.side_effect = [
273+
{"hits": {"hits": []}},
274+
{
275+
"hits": {
276+
"hits": [
277+
{"_source": self._openalex_article("https://openalex.org/W1", "en", "")},
278+
]
279+
}
280+
},
281+
]
282+
283+
matches = matcher.find_matches(
284+
[
285+
{
286+
"type": "article",
287+
"publication_year": 2025,
288+
"ids": {"doi": "10.1590/unmatched"},
289+
"title": "Ethical dilemmas in nursing professionals' work",
290+
"source_issns": ["0034-7167", "1984-0446"],
291+
}
292+
],
293+
max_candidates=3,
294+
)
295+
296+
self.assertEqual(len(matches), 1)
297+
self.assertEqual(matches[0][1], "title_year_author")
298+
self.assertEqual(matcher.client.client.search.call_count, 2)
299+
300+
def test_openalex_isbn_search_only_uses_bibliographic_isbn_fields(self):
301+
matcher = make_matcher("book")
302+
matcher.input_openalex_index = "raw_openalex_works"
303+
matcher.client = Mock()
304+
matcher.client.client.search.return_value = {"hits": {"hits": []}}
305+
306+
matcher._search_openalex_by_isbn(
307+
["9786500000001"],
308+
{"publication_year": 2025},
309+
)
310+
311+
body = matcher.client.client.search.call_args.kwargs["body"]
312+
self.assertNotIn("issn", str(body).lower())
313+
self.assertIn(
314+
{"terms": {"ids.isbn.keyword": ["9786500000001"]}},
315+
body["query"]["bool"]["should"],
316+
)
317+
self.assertIn(
318+
{"terms": {"biblio.isbns.keyword": ["9786500000001"]}},
319+
body["query"]["bool"]["should"],
320+
)
321+
322+
def test_openalex_title_search_uses_source_issn_constraint(self):
323+
matcher = make_matcher("article")
324+
matcher.input_openalex_index = "raw_openalex_works"
325+
matcher.client = Mock()
326+
matcher.client.client.search.return_value = {"hits": {"hits": []}}
327+
328+
matcher._search_openalex_by_title_year(
329+
{
330+
"publication_year": 2025,
331+
"title": "Ethical dilemmas in nursing professionals' work",
332+
"source_issns": ["0034-7167", "1984-0446"],
333+
},
334+
)
335+
336+
body = matcher.client.client.search.call_args.kwargs["body"]
337+
self.assertNotIn("isbn", str(body).lower())
338+
self.assertIn(
339+
{
340+
"match": {
341+
"title": {
342+
"query": "Ethical dilemmas in nursing professionals' work",
343+
"minimum_should_match": "90%",
344+
"fuzziness": "AUTO",
345+
}
346+
}
347+
},
348+
body["query"]["bool"]["must"],
349+
)
350+
self.assertEqual(
351+
body["query"]["bool"]["should"],
352+
[
353+
{"terms": {"source.issn.keyword": ["0034-7167", "1984-0446"]}},
354+
{"terms": {"source.issns.keyword": ["0034-7167", "1984-0446"]}},
355+
{"terms": {"primary_location.source.issn.keyword": ["0034-7167", "1984-0446"]}},
356+
{"terms": {"primary_location.source.issns.keyword": ["0034-7167", "1984-0446"]}},
357+
{"terms": {"locations.source.issn.keyword": ["0034-7167", "1984-0446"]}},
358+
{"terms": {"locations.source.issns.keyword": ["0034-7167", "1984-0446"]}},
359+
],
360+
)
361+
self.assertEqual(body["query"]["bool"]["minimum_should_match"], 1)
362+
363+
def test_openalex_title_search_ignores_non_standard_scielo_issn_fields(self):
364+
matcher = make_matcher("article")
365+
matcher.input_openalex_index = "raw_openalex_works"
366+
matcher.client = Mock()
367+
matcher.client.client.search.return_value = {"hits": {"hits": []}}
368+
369+
matcher._search_openalex_by_title_year(
370+
{
371+
"publication_year": 2025,
372+
"title": "Ethical dilemmas in nursing professionals' work",
373+
"journal_issns": ["0034-7167", "1984-0446"],
374+
},
375+
)
376+
377+
body = matcher.client.client.search.call_args.kwargs["body"]
378+
self.assertNotIn("should", body["query"]["bool"])
379+
self.assertNotIn("minimum_should_match", body["query"]["bool"])
380+
381+
def test_openalex_title_strategy_returns_validated_issn_match(self):
382+
matcher = make_matcher("article")
383+
matcher.input_openalex_index = "raw_openalex_works"
384+
matcher.client = Mock()
385+
matcher.client.client.search.return_value = {
386+
"hits": {
387+
"hits": [
388+
{"_source": self._openalex_article("https://openalex.org/W1", "en", "")},
389+
]
390+
}
391+
}
392+
393+
matches = matcher.find_matches(
394+
[
395+
{
396+
"type": "article",
397+
"publication_year": 2025,
398+
"title": "Ethical dilemmas in nursing professionals' work",
399+
"source_issns": ["0034-7167", "1984-0446"],
400+
}
401+
],
402+
max_candidates=3,
403+
)
404+
405+
self.assertEqual(len(matches), 1)
406+
self.assertEqual(matches[0][1], "title_year_author")
407+
self.assertIn("issn_match_2", matches[0][3]["reasons"])
408+
409+
def test_openalex_title_strategy_rejects_low_title_similarity(self):
410+
matcher = make_matcher("article")
411+
matcher.input_openalex_index = "raw_openalex_works"
412+
matcher.client = Mock()
413+
matcher.client.client.search.return_value = {
414+
"hits": {
415+
"hits": [
416+
{"_source": self._openalex_article("https://openalex.org/W1", "en", "")},
417+
]
418+
}
419+
}
420+
421+
matches = matcher.find_matches(
422+
[
423+
{
424+
"type": "article",
425+
"publication_year": 2025,
426+
"title": "Unrelated clinical protocol for dentistry",
427+
"source_issns": ["0034-7167", "1984-0446"],
428+
}
429+
],
430+
max_candidates=3,
431+
)
432+
433+
self.assertEqual(matches, [])
434+
435+
def test_openalex_issn_is_not_an_article_match_strategy(self):
436+
matcher = make_matcher("article")
437+
matcher.input_openalex_index = "raw_openalex_works"
438+
matcher.client = Mock()
439+
matcher.client.client.search.return_value = {
440+
"hits": {
441+
"hits": [
442+
{"_source": self._openalex_article("https://openalex.org/W1", "en", "")},
443+
]
444+
}
445+
}
446+
447+
matches = matcher.find_matches(
448+
[
449+
{
450+
"type": "article",
451+
"publication_year": 2025,
452+
"source_issns": ["0034-7167", "1984-0446"],
453+
}
454+
],
455+
max_candidates=3,
456+
)
457+
458+
self.assertEqual(matches, [])
459+
matcher.client.client.search.assert_not_called()
460+
248461
def test_openalex_match_skips_year_outside_configured_raw_range(self):
249462
matcher = make_matcher("article")
250463
matcher.input_openalex_index = "raw_openalex_works"

etl/tests/transform/test_extractors.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ def test_extract_doi_reads_top_level_ids_and_language_items(self):
2222
"10.1590/c",
2323
)
2424

25-
def test_extract_isbns_reads_document_parent_and_location_values(self):
25+
def test_extract_isbns_reads_document_and_parent_values(self):
2626
doc = {
2727
"isbn": "978-65-00-00000-1",
2828
"parent_book": {"ids": {"isbn": "85-359-0277-5"}},
@@ -31,7 +31,7 @@ def test_extract_isbns_reads_document_parent_and_location_values(self):
3131

3232
self.assertEqual(
3333
extract_isbns(doc),
34-
["8535902775", "9786500000001", "9788500000002"],
34+
["8535902775", "9786500000001"],
3535
)
3636

3737
def test_extract_issns_normalizes_source_values(self):

etl/transform/extractors.py

Lines changed: 0 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -48,25 +48,6 @@ def extract_isbns(doc: dict[str, Any]) -> list[str]:
4848
elif raw_value:
4949
values.append(raw_value)
5050

51-
for location_key in ("primary_location", "best_oa_location"):
52-
location = doc.get(location_key) if isinstance(doc.get(location_key), dict) else {}
53-
source = location.get("source") if isinstance(location.get("source"), dict) else {}
54-
raw_value = source.get("issns") or source.get("issn")
55-
if isinstance(raw_value, list):
56-
values.extend(raw_value)
57-
elif raw_value:
58-
values.append(raw_value)
59-
60-
for location in doc.get("locations") or []:
61-
if not isinstance(location, dict):
62-
continue
63-
source = location.get("source") if isinstance(location.get("source"), dict) else {}
64-
raw_value = source.get("issns") or source.get("issn")
65-
if isinstance(raw_value, list):
66-
values.extend(raw_value)
67-
elif raw_value:
68-
values.append(raw_value)
69-
7051
return sorted({normalized for value in values if (normalized := normalize_isbn(value))})
7152

7253

0 commit comments

Comments
 (0)