55from typing import Dict , List
66
77from quotaclimat .data_ingestion .factiva .utils_data_processing .detect_keywords import (
8- search_keywords_in_text ,
98 search_keywords_with_canonical_forms ,
109)
1110from quotaclimat .data_processing .mediatree .keyword .keyword import THEME_KEYWORDS
def find_keywords_in_text(text: str, keywords_info: List[Dict]) -> List[Dict]:
    """
    Locate every occurrence of the given keywords in a text.

    The actual matching is delegated to ``search_keywords_with_canonical_forms``
    (called with ``keep_duplicates=True`` and ``return_positions=True``). Each
    match it reports is enriched with the category registered for that keyword
    in ``keywords_info``. Duplicates are kept: a keyword matched several times
    produces several entries.

    Example:
        If text contains "canicules" and the dictionary has
        {"keyword": "canicule", "category": "X"}, the expected result is
        [{"keyword": "canicule", "category": "X", "start": 0, "end": 9}]
        (canonical form reported, category preserved, positions included).

    Args:
        text: The text to search in.
        keywords_info: List of dicts with 'keyword' and 'category' keys.

    Returns:
        One dict per match with the keys 'keyword', 'category', 'start' and
        'end'. The category is "" when the reported keyword is not a key of
        ``keywords_info``. An empty list is returned when either argument is
        empty.
    """
    # Nothing to search, or nothing to search for.
    if not (text and keywords_info):
        return []

    # The search helper only needs the raw keyword strings.
    matches = search_keywords_with_canonical_forms(
        text,
        [entry["keyword"] for entry in keywords_info],
        keep_duplicates=True,
        return_positions=True,
    )

    # Lookup table used to attach a category to each reported keyword.
    category_by_keyword = {
        entry["keyword"]: entry["category"] for entry in keywords_info
    }

    return [
        {
            "keyword": match["keyword"],
            "category": category_by_keyword.get(match["keyword"], ""),
            "start": match["start"],
            "end": match["end"],
        }
        for match in matches
    ]
@@ -154,6 +160,62 @@ def extract_keyword_strings(keyword_list: List[Dict]) -> List[str]:
154160 return [k ["keyword" ] for k in keyword_list ]
155161
156162
def filter_hrfp_overlapping_with_non_hrfp(
    keywords_with_positions_non_hrfp: List[Dict],
    keywords_with_positions_hrfp: List[Dict]
) -> List[Dict]:
    """
    Filter out HRFP keywords that overlap with non-HRFP keywords.

    When an HRFP match shares at least one text position with a non-HRFP
    match, the non-HRFP match has priority and the HRFP match is dropped.
    Ranges are compared with strict inequalities, so two matches that merely
    touch (one ends exactly where the other starts) do not overlap.

    Example:
        text = "le réchauffement climatique est un réchauffement"
        non_hrfp = [{"keyword": "réchauffement climatique", "start": 3, "end": 27, ...}]
        hrfp = [{"keyword": "réchauffement", "start": 3, "end": 16, ...},   # overlaps -> removed
                {"keyword": "réchauffement", "start": 35, "end": 48, ...}]  # no overlap -> kept

        Result: only the second "réchauffement" is kept.

    Args:
        keywords_with_positions_non_hrfp: Non-HRFP matches; each dict must
            provide 'start' and 'end'.
        keywords_with_positions_hrfp: HRFP matches; each dict must provide
            'start' and 'end' whenever there are non-HRFP matches to compare
            against.

    Returns:
        A new list (never the input list object itself) containing, in their
        original order, the HRFP matches that do not overlap any non-HRFP
        match. The dicts themselves are not copied.
    """
    if not keywords_with_positions_hrfp:
        return []

    if not keywords_with_positions_non_hrfp:
        # Nothing to filter against. Return a copy so that every code path
        # hands back a fresh list and callers can mutate the result without
        # touching their input.
        return list(keywords_with_positions_hrfp)

    # Only the position ranges of the non-HRFP matches matter here.
    non_hrfp_ranges = [
        (kw["start"], kw["end"]) for kw in keywords_with_positions_non_hrfp
    ]

    def _overlaps_non_hrfp(hrfp_kw: Dict) -> bool:
        """Return True if hrfp_kw shares a position with any non-HRFP range."""
        hrfp_start = hrfp_kw["start"]
        hrfp_end = hrfp_kw["end"]
        # Two ranges overlap if: start < other_end AND end > other_start
        return any(
            hrfp_start < non_hrfp_end and hrfp_end > non_hrfp_start
            for non_hrfp_start, non_hrfp_end in non_hrfp_ranges
        )

    return [
        hrfp_kw
        for hrfp_kw in keywords_with_positions_hrfp
        if not _overlaps_non_hrfp(hrfp_kw)
    ]
217+
218+
157219def extract_keyword_data_from_article (article_text : str ) -> Dict :
158220 """
159221 Extract keyword counts AND lists for all themes from a Factiva article.
@@ -357,12 +419,13 @@ def extract_keyword_data_from_article(article_text: str) -> Dict:
357419 "category" : kw ["category" ]
358420 })
359421
360- # ONE regex search for ALL non-HRFP keywords (with overlap filtering)
422+ # ONE regex search for ALL non-HRFP keywords (with overlap filtering + positions)
423+ found_keywords_no_hrfp_with_positions = []
361424 if all_keywords_no_hrfp :
362- found_keywords_no_hrfp = find_keywords_in_text (article_text , all_keywords_no_hrfp )
425+ found_keywords_no_hrfp_with_positions = find_keywords_in_text (article_text , all_keywords_no_hrfp )
363426
364427 # Distribute found keywords to their theme(s)
365- for kw_dict in found_keywords_no_hrfp :
428+ for kw_dict in found_keywords_no_hrfp_with_positions :
366429 keyword_str = kw_dict ["keyword" ]
367430 metadata_list = keyword_to_metadata_no_hrfp .get (keyword_str , [])
368431
@@ -384,12 +447,20 @@ def extract_keyword_data_from_article(article_text: str) -> Dict:
384447 "is_hrfp" : False
385448 })
386449
387- # ONE regex search for ALL HRFP keywords (with overlap filtering)
450+ # ONE regex search for ALL HRFP keywords (with overlap filtering + positions)
451+ # THEN filter out HRFP keywords that overlap with non-HRFP keywords
388452 if all_keywords_hrfp :
389- found_keywords_hrfp = find_keywords_in_text (article_text , all_keywords_hrfp )
453+ found_keywords_hrfp_raw = find_keywords_in_text (article_text , all_keywords_hrfp )
454+
455+ # Filter out HRFP keywords that overlap with non-HRFP keywords
456+ # Non-HRFP always has priority over HRFP when they overlap
457+ found_keywords_hrfp_filtered = filter_hrfp_overlapping_with_non_hrfp (
458+ found_keywords_no_hrfp_with_positions ,
459+ found_keywords_hrfp_raw
460+ )
390461
391462 # Distribute found keywords to their theme(s)
392- for kw_dict in found_keywords_hrfp :
463+ for kw_dict in found_keywords_hrfp_filtered :
393464 keyword_str = kw_dict ["keyword" ]
394465 metadata_list = keyword_to_metadata_hrfp .get (keyword_str , [])
395466
0 commit comments