@@ -69,6 +69,26 @@ def test_dataset_does_not_use_fuzzy_strategy(self):
6969
7070 self .assertEqual (len (groups ), 2 )
7171
def test_document_type_openalex_match_strategies_are_domain_specific(self):
    """Check the ``openalex_match_strategies`` rule of each document type.

    For every enabled pipeline config looked up by name, the rules built
    by ``to_rules()`` must list the expected strategies in this exact
    order: articles use DOI then title, while books and book chapters
    use DOI, ISBN and then title.
    """
    expected_strategies = {
        "article": ["doi", "title"],
        "book": ["doi", "isbn", "title"],
        "book-chapter": ["doi", "isbn", "title"],
    }
    for config_name, strategies in expected_strategies.items():
        # subTest keeps checking the remaining document types after one
        # fails and labels each failure with the offending config name.
        with self.subTest(config_name=config_name):
            rules = EtlPipelineConfig.objects.get_enabled_by_name(config_name).to_rules()
            self.assertEqual(rules["openalex_match_strategies"], strategies)
91+
7292 def test_non_article_targets_build_unit_groups_without_deduplicator (self ):
7393 pipeline = OpenSearchETLPipeline .__new__ (OpenSearchETLPipeline )
7494 pipeline .pipeline_config = EtlPipelineConfig .objects .get_for_source ("bronze_scielo_preprint" )
@@ -245,6 +265,199 @@ def test_openalex_doi_search_uses_exact_or_prefix_queries(self):
245265 doi_filter ["should" ],
246266 )
247267
def test_openalex_title_strategy_runs_when_doi_lookup_has_no_match(self):
    """A second search must produce the match when the first finds nothing.

    The mocked search returns an empty hit list on its first call
    (presumably the DOI lookup, going by the test name) and a single
    OpenAlex article on its second call. The record must end up with one
    match labelled ``title_year_author`` after exactly two search calls.
    """
    matcher = make_matcher("article")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()
    search = matcher.client.client.search

    no_hits = {"hits": {"hits": []}}
    one_hit = {
        "hits": {
            "hits": [
                {"_source": self._openalex_article("https://openalex.org/W1", "en", "")},
            ]
        }
    }
    # Responses are consumed in order, one per search call.
    search.side_effect = [no_hits, one_hit]

    scielo_record = {
        "type": "article",
        "publication_year": 2025,
        "ids": {"doi": "10.1590/unmatched"},
        "title": "Ethical dilemmas in nursing professionals' work",
        "source_issns": ["0034-7167", "1984-0446"],
    }
    matches = matcher.find_matches([scielo_record], max_candidates=3)

    self.assertEqual(len(matches), 1)
    first_match = matches[0]
    self.assertEqual(first_match[1], "title_year_author")
    self.assertEqual(search.call_count, 2)
299+
def test_openalex_isbn_search_only_uses_bibliographic_isbn_fields(self):
    """The ISBN search body must query ISBN fields and never mention ISSN.

    Calls ``_search_openalex_by_isbn`` against a mocked client and
    inspects the ``body`` keyword argument of the resulting search call.
    """
    isbn = "9786500000001"

    matcher = make_matcher("book")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()
    search = matcher.client.client.search
    search.return_value = {"hits": {"hits": []}}

    matcher._search_openalex_by_isbn([isbn], {"publication_year": 2025})

    body = search.call_args.kwargs["body"]
    # No ISSN-based clause may appear anywhere in the serialized query.
    self.assertNotIn("issn", str(body).lower())

    should_clauses = body["query"]["bool"]["should"]
    for isbn_field in ("ids.isbn.keyword", "biblio.isbns.keyword"):
        self.assertIn({"terms": {isbn_field: [isbn]}}, should_clauses)
321+
def test_openalex_title_search_uses_source_issn_constraint(self):
    """The title search must constrain candidates by ``source_issns``.

    Inspects the ``body`` sent to the mocked search: it must contain the
    fuzzy title ``match`` clause under ``must``, one ``terms`` clause per
    OpenAlex ISSN field under ``should`` (in a fixed order), a
    ``minimum_should_match`` of 1, and no mention of ISBN.
    """
    title = "Ethical dilemmas in nursing professionals' work"
    issns = ("0034-7167", "1984-0446")
    issn_fields = (
        "source.issn.keyword",
        "source.issns.keyword",
        "primary_location.source.issn.keyword",
        "primary_location.source.issns.keyword",
        "locations.source.issn.keyword",
        "locations.source.issns.keyword",
    )
    expected_title_clause = {
        "match": {
            "title": {
                "query": title,
                "minimum_should_match": "90%",
                "fuzziness": "AUTO",
            }
        }
    }
    # Fresh list per clause so the expectation never shares state with the input.
    expected_should = [{"terms": {field: list(issns)}} for field in issn_fields]

    matcher = make_matcher("article")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()
    search = matcher.client.client.search
    search.return_value = {"hits": {"hits": []}}

    matcher._search_openalex_by_title_year(
        {
            "publication_year": 2025,
            "title": title,
            "source_issns": list(issns),
        },
    )

    body = search.call_args.kwargs["body"]
    self.assertNotIn("isbn", str(body).lower())

    bool_query = body["query"]["bool"]
    self.assertIn(expected_title_clause, bool_query["must"])
    self.assertEqual(bool_query["should"], expected_should)
    self.assertEqual(bool_query["minimum_should_match"], 1)
362+
def test_openalex_title_search_ignores_non_standard_scielo_issn_fields(self):
    """ISSNs supplied under ``journal_issns`` must not constrain the search.

    The record carries its ISSNs under ``journal_issns`` and has no
    ``source_issns`` key; the resulting bool query must have neither a
    ``should`` section nor a ``minimum_should_match`` setting.
    """
    matcher = make_matcher("article")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()
    search = matcher.client.client.search
    search.return_value = {"hits": {"hits": []}}

    scielo_record = {
        "publication_year": 2025,
        "title": "Ethical dilemmas in nursing professionals' work",
        "journal_issns": ["0034-7167", "1984-0446"],
    }
    matcher._search_openalex_by_title_year(scielo_record)

    bool_query = search.call_args.kwargs["body"]["query"]["bool"]
    for absent_key in ("should", "minimum_should_match"):
        self.assertNotIn(absent_key, bool_query)
380+
def test_openalex_title_strategy_returns_validated_issn_match(self):
    """A title match must be returned with an ISSN reason recorded.

    The mocked search always returns one OpenAlex article. For a record
    with a title and two ``source_issns`` (and no DOI), ``find_matches``
    must yield a single match labelled ``title_year_author`` whose
    ``reasons`` include ``issn_match_2``.
    """
    matcher = make_matcher("article")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()
    openalex_hit = {"_source": self._openalex_article("https://openalex.org/W1", "en", "")}
    matcher.client.client.search.return_value = {"hits": {"hits": [openalex_hit]}}

    scielo_record = {
        "type": "article",
        "publication_year": 2025,
        "title": "Ethical dilemmas in nursing professionals' work",
        "source_issns": ["0034-7167", "1984-0446"],
    }
    matches = matcher.find_matches([scielo_record], max_candidates=3)

    self.assertEqual(len(matches), 1)
    first_match = matches[0]
    self.assertEqual(first_match[1], "title_year_author")
    self.assertIn("issn_match_2", first_match[3]["reasons"])
408+
def test_openalex_title_strategy_rejects_low_title_similarity(self):
    """A candidate returned by the search must be dropped for this title.

    The mocked search returns one OpenAlex article, but the record's
    title is "Unrelated clinical protocol for dentistry"; ``find_matches``
    must return an empty list. (Presumably the fixture article's title
    differs from it, going by the test name.)
    """
    matcher = make_matcher("article")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()
    openalex_hit = {"_source": self._openalex_article("https://openalex.org/W1", "en", "")}
    matcher.client.client.search.return_value = {"hits": {"hits": [openalex_hit]}}

    scielo_record = {
        "type": "article",
        "publication_year": 2025,
        "title": "Unrelated clinical protocol for dentistry",
        "source_issns": ["0034-7167", "1984-0446"],
    }
    matches = matcher.find_matches([scielo_record], max_candidates=3)

    self.assertEqual(matches, [])
434+
def test_openalex_issn_is_not_an_article_match_strategy(self):
    """ISSNs alone must not trigger any OpenAlex search for an article.

    The record has neither a DOI nor a title, only ``source_issns``.
    Even though the mocked search would return an article, ``find_matches``
    must return no matches and must never call the search at all.
    """
    matcher = make_matcher("article")
    matcher.input_openalex_index = "raw_openalex_works"
    matcher.client = Mock()
    search = matcher.client.client.search
    openalex_hit = {"_source": self._openalex_article("https://openalex.org/W1", "en", "")}
    search.return_value = {"hits": {"hits": [openalex_hit]}}

    issn_only_record = {
        "type": "article",
        "publication_year": 2025,
        "source_issns": ["0034-7167", "1984-0446"],
    }
    matches = matcher.find_matches([issn_only_record], max_candidates=3)

    self.assertEqual(matches, [])
    search.assert_not_called()
460+
248461 def test_openalex_match_skips_year_outside_configured_raw_range (self ):
249462 matcher = make_matcher ("article" )
250463 matcher .input_openalex_index = "raw_openalex_works"
0 commit comments