Skip to content

Commit a58fd60

Browse files
melhora e adiciona alguns testes sobre issn e isbn
1 parent b69a95a commit a58fd60

2 files changed

Lines changed: 226 additions & 0 deletions

File tree

etl/tests/domain/test_models.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from datetime import timedelta
22

3+
from django.core.exceptions import ValidationError
34
from django.test import TestCase
45
from django.utils import timezone
56

@@ -134,3 +135,15 @@ def test_pipeline_config_to_rules_with_partial_rules_json(self):
134135
self.assertEqual(rules["fuzzy_min_similarity"], 0.90)
135136
self.assertEqual(rules["openalex_validation"]["min_score"], 70)
136137
self.assertEqual(rules["doi_requires_title_overlap"], True)
138+
139+
def test_pipeline_config_rejects_issn_openalex_strategy(self):
    """``to_rules()`` must raise ``ValidationError`` for this config.

    The config's ``rules`` list "issn" among the
    ``openalex_match_strategies``.
    """
    article_config = EtlPipelineConfig(
        name="article",
        input_index="bronze_scielo_articles",
        input_document_kind="article",
        default_document_type="article",
        rules={"openalex_match_strategies": ["doi", "issn", "title"]},
    )

    # Callable form of assertRaises: the rules are built inside the check.
    self.assertRaises(ValidationError, article_config.to_rules)

etl/tests/pipeline/test_pipeline_dedup.py

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,26 @@ def test_dataset_does_not_use_fuzzy_strategy(self):
6969

7070
self.assertEqual(len(groups), 2)
7171

72+
def test_document_type_openalex_match_strategies_are_domain_specific(self):
    """Check ``openalex_match_strategies`` for each enabled pipeline config.

    Each config is looked up by name via
    ``EtlPipelineConfig.objects.get_enabled_by_name`` and converted with
    ``to_rules()``. Every config is checked in its own ``subTest`` so a
    failure for one document type does not hide the results of the others.
    """
    expected_by_config = {
        "article": ["doi", "title"],
        "book": ["doi", "isbn", "title"],
        "book-chapter": ["doi", "isbn", "title"],
    }

    for config_name, expected_strategies in expected_by_config.items():
        with self.subTest(config=config_name):
            rules = EtlPipelineConfig.objects.get_enabled_by_name(
                config_name
            ).to_rules()
            self.assertEqual(
                rules["openalex_match_strategies"],
                expected_strategies,
            )
91+
7292
def test_non_article_targets_build_unit_groups_without_deduplicator(self):
7393
pipeline = OpenSearchETLPipeline.__new__(OpenSearchETLPipeline)
7494
pipeline.pipeline_config = EtlPipelineConfig.objects.get_for_source("bronze_scielo_preprint")
@@ -245,6 +265,199 @@ def test_openalex_doi_search_uses_exact_or_prefix_queries(self):
245265
doi_filter["should"],
246266
)
247267

268+
def test_openalex_title_strategy_runs_when_doi_lookup_has_no_match(self):
    """A second search is issued after the first one returns no hits.

    The mocked client answers the first ``search`` call with an empty hit
    list and the second with one OpenAlex candidate. The single resulting
    match must be labelled "title_year_author" and exactly two searches
    must have been made.
    """
    matcher = make_matcher("article")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()

    candidate = self._openalex_article("https://openalex.org/W1", "en", "")
    empty_response = {"hits": {"hits": []}}
    candidate_response = {"hits": {"hits": [{"_source": candidate}]}}
    # Responses are consumed in call order: first empty, then the candidate.
    matcher.client.client.search.side_effect = [
        empty_response,
        candidate_response,
    ]

    scielo_document = {
        "type": "article",
        "publication_year": 2025,
        "ids": {"doi": "10.1590/unmatched"},
        "title": "Ethical dilemmas in nursing professionals' work",
        "source_issns": ["0034-7167", "1984-0446"],
    }
    matches = matcher.find_matches([scielo_document], max_candidates=3)

    self.assertEqual(len(matches), 1)
    self.assertEqual(matches[0][1], "title_year_author")
    self.assertEqual(matcher.client.client.search.call_count, 2)
299+
300+
def test_openalex_isbn_search_only_uses_bibliographic_isbn_fields(self):
    """The ISBN search body targets the ISBN keyword fields and never ISSN.

    Calls ``_search_openalex_by_isbn`` against a mocked client, then
    inspects the ``body`` keyword argument of the recorded ``search`` call.
    """
    matcher = make_matcher("book")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()
    search_mock = matcher.client.client.search
    search_mock.return_value = {"hits": {"hits": []}}

    matcher._search_openalex_by_isbn(
        ["9786500000001"],
        {"publication_year": 2025},
    )

    body = search_mock.call_args.kwargs["body"]
    # No ISSN-related text may appear anywhere in the serialized query.
    self.assertNotIn("issn", str(body).lower())
    for isbn_field in ("ids.isbn.keyword", "biblio.isbns.keyword"):
        self.assertIn(
            {"terms": {isbn_field: ["9786500000001"]}},
            body["query"]["bool"]["should"],
        )
321+
322+
def test_openalex_title_search_uses_source_issn_constraint(self):
    """The title search combines a fuzzy title match with ISSN clauses.

    For a document carrying ``source_issns``, the recorded search body must
    contain the fuzzy title ``match`` clause under ``must``, one ``terms``
    clause per ISSN field under ``should`` (in the order listed below), and
    ``minimum_should_match`` equal to 1. No ISBN text may appear.
    """
    title = "Ethical dilemmas in nursing professionals' work"
    issn_fields = (
        "source.issn.keyword",
        "source.issns.keyword",
        "primary_location.source.issn.keyword",
        "primary_location.source.issns.keyword",
        "locations.source.issn.keyword",
        "locations.source.issns.keyword",
    )

    matcher = make_matcher("article")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()
    search_mock = matcher.client.client.search
    search_mock.return_value = {"hits": {"hits": []}}

    matcher._search_openalex_by_title_year(
        {
            "publication_year": 2025,
            "title": title,
            "source_issns": ["0034-7167", "1984-0446"],
        },
    )

    body = search_mock.call_args.kwargs["body"]
    self.assertNotIn("isbn", str(body).lower())

    expected_title_clause = {
        "match": {
            "title": {
                "query": title,
                "minimum_should_match": "90%",
                "fuzziness": "AUTO",
            }
        }
    }
    self.assertIn(expected_title_clause, body["query"]["bool"]["must"])

    # A fresh ISSN list per clause keeps the expectation independent of
    # the list object handed to the matcher.
    expected_should = [
        {"terms": {field: ["0034-7167", "1984-0446"]}}
        for field in issn_fields
    ]
    self.assertEqual(body["query"]["bool"]["should"], expected_should)
    self.assertEqual(body["query"]["bool"]["minimum_should_match"], 1)
362+
363+
def test_openalex_title_search_ignores_non_standard_scielo_issn_fields(self):
    """ISSNs supplied under ``journal_issns`` add no ``should`` constraint.

    The recorded search body's bool query must contain neither ``should``
    nor ``minimum_should_match``.
    """
    matcher = make_matcher("article")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()
    search_mock = matcher.client.client.search
    search_mock.return_value = {"hits": {"hits": []}}

    scielo_document = {
        "publication_year": 2025,
        "title": "Ethical dilemmas in nursing professionals' work",
        "journal_issns": ["0034-7167", "1984-0446"],
    }
    matcher._search_openalex_by_title_year(scielo_document)

    body = search_mock.call_args.kwargs["body"]
    for absent_key in ("should", "minimum_should_match"):
        self.assertNotIn(absent_key, body["query"]["bool"])
380+
381+
def test_openalex_title_strategy_returns_validated_issn_match(self):
    """A DOI-less document yields one match with an ISSN reason recorded.

    The mocked search always returns a single OpenAlex candidate. The match
    must be labelled "title_year_author" and its fourth element must list
    "issn_match_2" among its ``reasons``.
    """
    matcher = make_matcher("article")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()
    candidate = self._openalex_article("https://openalex.org/W1", "en", "")
    matcher.client.client.search.return_value = {
        "hits": {"hits": [{"_source": candidate}]}
    }

    scielo_document = {
        "type": "article",
        "publication_year": 2025,
        "title": "Ethical dilemmas in nursing professionals' work",
        "source_issns": ["0034-7167", "1984-0446"],
    }
    matches = matcher.find_matches([scielo_document], max_candidates=3)

    self.assertEqual(len(matches), 1)
    first_match = matches[0]
    self.assertEqual(first_match[1], "title_year_author")
    self.assertIn("issn_match_2", first_match[3]["reasons"])
408+
409+
def test_openalex_title_strategy_rejects_low_title_similarity(self):
    """No match is returned for the "Unrelated clinical protocol" document.

    The mocked search still returns one OpenAlex candidate, yet
    ``find_matches`` must return an empty list.
    NOTE(review): presumably the candidate fixture's title differs from the
    document's; confirm against ``_openalex_article``.
    """
    matcher = make_matcher("article")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()
    candidate = self._openalex_article("https://openalex.org/W1", "en", "")
    matcher.client.client.search.return_value = {
        "hits": {"hits": [{"_source": candidate}]}
    }

    unrelated_document = {
        "type": "article",
        "publication_year": 2025,
        "title": "Unrelated clinical protocol for dentistry",
        "source_issns": ["0034-7167", "1984-0446"],
    }
    matches = matcher.find_matches([unrelated_document], max_candidates=3)

    self.assertEqual(matches, [])
434+
435+
def test_openalex_issn_is_not_an_article_match_strategy(self):
    """A document with only ISSNs yields no match and triggers no search.

    The document has neither a DOI nor a title. Even though the mocked
    client would return a candidate, ``find_matches`` must return an empty
    list and ``search`` must never be called.
    """
    matcher = make_matcher("article")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()
    search_mock = matcher.client.client.search
    candidate = self._openalex_article("https://openalex.org/W1", "en", "")
    search_mock.return_value = {"hits": {"hits": [{"_source": candidate}]}}

    issn_only_document = {
        "type": "article",
        "publication_year": 2025,
        "source_issns": ["0034-7167", "1984-0446"],
    }
    matches = matcher.find_matches([issn_only_document], max_candidates=3)

    self.assertEqual(matches, [])
    search_mock.assert_not_called()
460+
248461
def test_openalex_match_skips_year_outside_configured_raw_range(self):
249462
matcher = make_matcher("article")
250463
matcher.input_openalex_index = "raw_openalex_works"

0 commit comments

Comments
 (0)