@@ -47,6 +47,12 @@ def find_matches(
4747
4848 primary = select_primary_scielo_doc (scielo_group )
4949 rules = self .rules
50+ if not self ._can_search_openalex (primary ):
51+ logger .debug (
52+ "Skipping OpenAlex match lookup for SciELO doc outside configured query scope"
53+ )
54+ return []
55+
5056 matches = []
5157
5258 for strategy in rules ["openalex_match_strategies" ]:
@@ -119,13 +125,71 @@ def _deduplicate_openalex_matches(self, matches: list) -> list:
119125 deduped .append (match )
120126 return deduped
121127
def _can_search_openalex(self, scielo_doc: dict) -> bool:
    """Return whether an OpenAlex lookup should be attempted for ``scielo_doc``.

    The document's publication year, widened on both sides by
    ``self._year_tolerance()``, is compared with the optional
    ``publication_year_min`` / ``publication_year_max`` entries of
    ``self.rules["openalex_query"]``.

    Args:
        scielo_doc: SciELO document mapping; only its publication year
            (as returned by ``self._publication_year``) is consulted.

    Returns:
        ``False`` when the document has no usable publication year, or when
        its tolerance window lies entirely below the minimum bound or
        entirely above the maximum bound. ``True`` otherwise, including when
        no bounds are configured.
    """
    year = self._publication_year(scielo_doc)
    if year is None:
        return False

    query_rules = self.rules.get("openalex_query") or {}
    raw_bounds = (
        query_rules.get("publication_year_min"),
        query_rules.get("publication_year_max"),
    )
    if all(raw is None for raw in raw_bounds):
        return True

    # Parse each bound on its own so that one malformed bound is ignored
    # without also disabling the other, valid bound.
    parsed_bounds = []
    malformed = False
    for raw in raw_bounds:
        try:
            parsed_bounds.append(None if raw is None else int(raw))
        except (TypeError, ValueError):
            malformed = True
            parsed_bounds.append(None)
    if malformed:
        logger.warning("Invalid OpenAlex publication year query bounds: %s", query_rules)
    min_year, max_year = parsed_bounds

    tolerance = self._year_tolerance()
    if min_year is not None and year + tolerance < min_year:
        return False
    if max_year is not None and year - tolerance > max_year:
        return False
    return True
153+
154+ def _publication_year (self , doc : dict ) -> int | None :
155+ try :
156+ return int (doc .get ("publication_year" ))
157+ except (TypeError , ValueError ):
158+ return None
159+
def _year_tolerance(self) -> int:
    """Return the non-negative publication-year tolerance from the rules.

    Reads ``year_tolerance`` from ``self.rules["openalex_validation"]``.

    Returns:
        The configured tolerance as an int; ``0`` when the section or the
        value is missing, falsy, not convertible to int, or negative.
    """
    validation_rules = self.rules.get("openalex_validation") or {}
    try:
        tolerance = int(validation_rules.get("year_tolerance", 0) or 0)
    except (TypeError, ValueError, OverflowError):
        return 0
    # A negative tolerance would make ``year - tolerance`` exceed
    # ``year + tolerance``, i.e. an inverted (empty) year window.
    return max(tolerance, 0)
166+
def _apply_openalex_query_constraints(
    self,
    query: dict[str, Any],
    scielo_doc: dict[str, Any],
) -> dict[str, Any]:
    """Add the rule-driven constraints to ``query`` in place and return it.

    Args:
        query: Query mapping; a ``"bool"`` section is created when absent.
        scielo_doc: SciELO document whose publication year (via
            ``self._publication_year``) drives the year range filter.

    Returns:
        The same ``query`` object, with an ``is_xpac`` exclusion appended to
        ``must_not`` when ``exclude_is_xpac`` is set in
        ``self.rules["openalex_query"]``, and a ``publication_year`` range
        appended to ``filter`` when the document has a usable year.
    """
    clauses = query.setdefault("bool", {})
    settings = self.rules.get("openalex_query") or {}

    if settings.get("exclude_is_xpac"):
        excluded = clauses.setdefault("must_not", [])
        excluded.append({"term": {"is_xpac": True}})

    doc_year = self._publication_year(scielo_doc)
    if doc_year is None:
        # No usable year: leave the query without a year range filter.
        return query

    window = self._year_tolerance()
    year_range = {"gte": doc_year - window, "lte": doc_year + window}
    filters = clauses.setdefault("filter", [])
    filters.append({"range": {"publication_year": year_range}})
    return query
186+
122187 def _search_openalex_by_doi (
123188 self ,
124189 doi : str ,
125190 scielo_doc : Dict [str , Any ],
126191 size : int = 10 ,
127192 ) -> List [Dict [str , Any ]]:
128- year = scielo_doc .get ("publication_year" )
129193 normalized_doi = normalize_doi (doi )
130194 if not normalized_doi :
131195 logger .warning ("Invalid DOI after normalization: %s" , doi )
@@ -143,14 +207,7 @@ def _search_openalex_by_doi(
143207 ]
144208 }
145209 }
146- if year is not None :
147- try :
148- year = int (year )
149- query ["bool" ]["filter" ].append (
150- {"range" : {"publication_year" : {"gte" : year - 1 , "lte" : year + 1 }}}
151- )
152- except (ValueError , TypeError ):
153- pass
210+ self ._apply_openalex_query_constraints (query , scielo_doc )
154211
155212 try :
156213 response = self .client .client .search (
@@ -185,7 +242,6 @@ def _search_openalex_by_isbn(
185242 scielo_doc : Dict [str , Any ],
186243 size : int = 10 ,
187244 ) -> List [Dict [str , Any ]]:
188- year = scielo_doc .get ("publication_year" )
189245 query = {
190246 "bool" : {
191247 "should" : [
@@ -201,15 +257,7 @@ def _search_openalex_by_isbn(
201257 "minimum_should_match" : 1 ,
202258 }
203259 }
204-
205- if year is not None :
206- try :
207- year = int (year )
208- query ["bool" ]["filter" ] = [
209- {"range" : {"publication_year" : {"gte" : year - 1 , "lte" : year + 1 }}}
210- ]
211- except (ValueError , TypeError ):
212- pass
260+ self ._apply_openalex_query_constraints (query , scielo_doc )
213261
214262 try :
215263 response = self .client .client .search (
@@ -227,7 +275,6 @@ def _search_openalex_by_title_year(
227275 size : int = 10 ,
228276 ) -> List [Dict [str , Any ]]:
229277 title = scielo_doc .get ("title" , "" )
230- year = scielo_doc .get ("publication_year" )
231278 issns = scielo_doc .get ("source_issns" , [])
232279 if not title :
233280 return []
@@ -247,16 +294,7 @@ def _search_openalex_by_title_year(
247294 ]
248295 }
249296 }
250-
251- if year :
252- try :
253- year_int = int (year )
254- except (TypeError , ValueError ):
255- logger .warning ("Invalid year value: %s, skipping range filter" , year )
256- return []
257- query ["bool" ]["filter" ] = [
258- {"range" : {"publication_year" : {"gte" : year_int - 1 , "lte" : year_int + 1 }}}
259- ]
297+ self ._apply_openalex_query_constraints (query , scielo_doc )
260298
261299 if issns :
262300 query ["bool" ]["should" ] = [{"terms" : {"source.issns" : issns }}]
0 commit comments