Skip to content

Commit d26d13f

Browse files
committed
Review detection logic to avoid overlap between non-HRFP and HRFP keywords
1 parent 1f8cca5 commit d26d13f

File tree

2 files changed

+126
-25
lines changed

2 files changed

+126
-25
lines changed

quotaclimat/data_ingestion/factiva/utils_data_processing/detect_keywords.py

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import re
22
from itertools import product
3-
from typing import Dict, List, Tuple
3+
from typing import Dict, List
44

55

66
def format_word_regex(word: str) -> str:
@@ -260,8 +260,8 @@ def create_variant_to_canonical_mapping(keywords_filtered: List[str]) -> Dict[st
260260

261261

262262
def search_keywords_with_canonical_forms(
263-
text: str, keywords_filtered: List[str], keep_duplicates: bool = False
264-
) -> List[str]:
263+
text: str, keywords_filtered: List[str], keep_duplicates: bool = False, return_positions: bool = False
264+
) -> List:
265265
"""
266266
Searches keywords in text and returns CANONICAL forms (from dictionary).
267267
@@ -276,16 +276,22 @@ def search_keywords_with_canonical_forms(
276276
keywords_filtered: List of canonical keywords from the dictionary
277277
keep_duplicates: If True, returns all occurrences (with duplicates).
278278
If False, returns unique keywords only (default).
279+
return_positions: If True, returns list of dicts with positions [{"keyword": "...", "start": X, "end": Y}, ...].
280+
If False, returns list of strings ["keyword1", "keyword2", ...] (default, backward compatible).
279281
280282
Returns:
281-
List of CANONICAL keywords found (matching the dictionary forms)
283+
If return_positions=False: List of CANONICAL keyword strings (default, backward compatible)
284+
If return_positions=True: List of dicts with structure [{"keyword": "...", "start": X, "end": Y}, ...]
282285
283286
Example:
284287
>>> search_keywords_with_canonical_forms("Les canicules augmentent", ["canicule"])
285288
["canicule"] # Returns canonical form, not "canicules"
286289
287290
>>> search_keywords_with_canonical_forms("gaz à effet de serre", ["effet de serre", "gaz à effet de serre"])
288291
["gaz à effet de serre"] # Only longest match is kept
292+
293+
>>> search_keywords_with_canonical_forms("Les canicules augmentent", ["canicule"], return_positions=True)
294+
[{"keyword": "canicule", "start": 4, "end": 13}] # With positions
289295
"""
290296
if not keywords_filtered or not text:
291297
return []
@@ -333,22 +339,46 @@ def search_keywords_with_canonical_forms(
333339

334340
# If no overlap, accept this match
335341
if not is_overlapping:
336-
accepted_matches.append(match['text'])
342+
accepted_matches.append(match)
337343
used_ranges.append((start, end))
338344

339345
# Step 5: Map each accepted match to its canonical form (fast O(1) dict lookup)
340346
canonical_matches = []
341347
for match in accepted_matches:
342-
match_lower = match.lower()
348+
match_lower = match['text'].lower()
343349
canonical_form = variant_to_canonical.get(match_lower)
344350
if canonical_form:
345-
canonical_matches.append(canonical_form)
351+
if return_positions:
352+
canonical_matches.append({
353+
"keyword": canonical_form,
354+
"start": match['start'],
355+
"end": match['end']
356+
})
357+
else:
358+
canonical_matches.append(canonical_form)
346359
else:
347360
# Fallback: keep original if no mapping found
348-
canonical_matches.append(match)
361+
if return_positions:
362+
canonical_matches.append({
363+
"keyword": match['text'],
364+
"start": match['start'],
365+
"end": match['end']
366+
})
367+
else:
368+
canonical_matches.append(match['text'])
349369

350370
# Return unique or all occurrences based on parameter
351371
if keep_duplicates:
352372
return canonical_matches
353373
else:
354-
return list(set(canonical_matches))
374+
if return_positions:
375+
# Remove duplicates based on keyword (keep first occurrence)
376+
seen_keywords = set()
377+
unique_matches = []
378+
for match in canonical_matches:
379+
if match["keyword"] not in seen_keywords:
380+
seen_keywords.add(match["keyword"])
381+
unique_matches.append(match)
382+
return unique_matches
383+
else:
384+
return list(set(canonical_matches))

quotaclimat/data_processing/factiva/s3_to_postgre/extract_keywords_factiva.py

Lines changed: 87 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
from typing import Dict, List
66

77
from quotaclimat.data_ingestion.factiva.utils_data_processing.detect_keywords import (
8-
search_keywords_in_text,
98
search_keywords_with_canonical_forms,
109
)
1110
from quotaclimat.data_processing.mediatree.keyword.keyword import THEME_KEYWORDS
@@ -91,38 +90,45 @@ def find_keywords_in_text(text: str, keywords_info: List[Dict]) -> List[Dict]:
9190
"""
9291
Find ALL occurrences of keywords in text (including duplicates) with their metadata.
9392
94-
This function uses named capture groups to return CANONICAL forms from the dictionary,
93+
This function returns CANONICAL forms from the dictionary,
9594
ensuring that categories are always preserved even for plural forms.
9695
9796
Example:
9897
If text contains "canicules" and dictionary has {"keyword": "canicule", "category": "X"}:
99-
Returns [{"keyword": "canicule", "category": "X"}] # Preserved!
98+
Returns [{"keyword": "canicule", "category": "X", "start": 0, "end": 9}] # Preserved with positions!
10099
101100
Args:
102101
text: The text to search in
103102
keywords_info: List of dicts with 'keyword' and 'category' keys
104103
105104
Returns:
106-
List of dicts with found keywords (canonical forms) and their categories (with duplicates)
105+
List of dicts with found keywords (canonical forms), categories, and positions (with duplicates)
107106
"""
108107
if not text or not keywords_info:
109108
return []
110109

111110
# Extract just the keyword strings for search
112111
keyword_strings = [k["keyword"] for k in keywords_info]
113112

114-
# Find all occurrences using named groups (returns CANONICAL forms)
115-
found_keywords = search_keywords_with_canonical_forms(text, keyword_strings, keep_duplicates=True)
113+
# Find all occurrences WITH POSITIONS using search_keywords_with_canonical_forms
114+
found_keywords_with_positions = search_keywords_with_canonical_forms(
115+
text,
116+
keyword_strings,
117+
keep_duplicates=True,
118+
return_positions=True
119+
)
116120

117121
# Create a mapping from keyword to category
118122
keyword_to_category = {k["keyword"]: k["category"] for k in keywords_info}
119123

120-
# Build result with categories
124+
# Build result with categories and positions
121125
result = []
122-
for keyword in found_keywords:
126+
for item in found_keywords_with_positions:
123127
result.append({
124-
"keyword": keyword,
125-
"category": keyword_to_category.get(keyword, "")
128+
"keyword": item["keyword"],
129+
"category": keyword_to_category.get(item["keyword"], ""),
130+
"start": item["start"],
131+
"end": item["end"]
126132
})
127133

128134
return result
@@ -154,6 +160,62 @@ def extract_keyword_strings(keyword_list: List[Dict]) -> List[str]:
154160
return [k["keyword"] for k in keyword_list]
155161

156162

163+
def filter_hrfp_overlapping_with_non_hrfp(
    keywords_with_positions_non_hrfp: List[Dict],
    keywords_with_positions_hrfp: List[Dict]
) -> List[Dict]:
    """
    Filter out HRFP keywords that overlap with non-HRFP keywords.

    When an HRFP match shares at least one character position with a non-HRFP
    match, the non-HRFP match has priority and the HRFP match is dropped.
    Ranges that merely touch (one ends exactly where the other starts) are
    NOT considered overlapping.

    Example:
        text = "le réchauffement climatique est un réchauffement"
        non_hrfp = [{"keyword": "réchauffement climatique", "start": 3, "end": 27, ...}]
        hrfp = [{"keyword": "réchauffement", "start": 3, "end": 16, ...},   # overlaps non-HRFP -> removed
                {"keyword": "réchauffement", "start": 35, "end": 48, ...}]  # no overlap -> kept

        Result: only the second "réchauffement" is kept.

    Args:
        keywords_with_positions_non_hrfp: Non-HRFP matches; each dict must
            provide "start" and "end" keys.
        keywords_with_positions_hrfp: HRFP matches; each dict must provide
            "start" and "end" keys.

    Returns:
        A new list holding the HRFP dicts (same objects, original order) that
        do not overlap any non-HRFP match. The input lists are never returned
        directly, so callers may mutate the result safely.
    """
    if not keywords_with_positions_hrfp:
        return []

    if not keywords_with_positions_non_hrfp:
        # Nothing to filter against; return a shallow copy so the result is
        # always a fresh list, consistent with the filtering path below.
        return list(keywords_with_positions_hrfp)

    # Position ranges occupied by non-HRFP matches
    non_hrfp_ranges = [
        (kw["start"], kw["end"]) for kw in keywords_with_positions_non_hrfp
    ]

    def overlaps_non_hrfp(hrfp_kw: Dict) -> bool:
        # Two ranges overlap if: start < other_end AND end > other_start
        hrfp_start = hrfp_kw["start"]
        hrfp_end = hrfp_kw["end"]
        return any(
            hrfp_start < non_hrfp_end and hrfp_end > non_hrfp_start
            for non_hrfp_start, non_hrfp_end in non_hrfp_ranges
        )

    # Keep only the HRFP matches that do not collide with any non-HRFP match
    return [
        hrfp_kw
        for hrfp_kw in keywords_with_positions_hrfp
        if not overlaps_non_hrfp(hrfp_kw)
    ]
217+
218+
157219
def extract_keyword_data_from_article(article_text: str) -> Dict:
158220
"""
159221
Extract keyword counts AND lists for all themes from a Factiva article.
@@ -357,12 +419,13 @@ def extract_keyword_data_from_article(article_text: str) -> Dict:
357419
"category": kw["category"]
358420
})
359421

360-
# ONE regex search for ALL non-HRFP keywords (with overlap filtering)
422+
# ONE regex search for ALL non-HRFP keywords (with overlap filtering + positions)
423+
found_keywords_no_hrfp_with_positions = []
361424
if all_keywords_no_hrfp:
362-
found_keywords_no_hrfp = find_keywords_in_text(article_text, all_keywords_no_hrfp)
425+
found_keywords_no_hrfp_with_positions = find_keywords_in_text(article_text, all_keywords_no_hrfp)
363426

364427
# Distribute found keywords to their theme(s)
365-
for kw_dict in found_keywords_no_hrfp:
428+
for kw_dict in found_keywords_no_hrfp_with_positions:
366429
keyword_str = kw_dict["keyword"]
367430
metadata_list = keyword_to_metadata_no_hrfp.get(keyword_str, [])
368431

@@ -384,12 +447,20 @@ def extract_keyword_data_from_article(article_text: str) -> Dict:
384447
"is_hrfp": False
385448
})
386449

387-
# ONE regex search for ALL HRFP keywords (with overlap filtering)
450+
# ONE regex search for ALL HRFP keywords (with overlap filtering + positions)
451+
# THEN filter out HRFP keywords that overlap with non-HRFP keywords
388452
if all_keywords_hrfp:
389-
found_keywords_hrfp = find_keywords_in_text(article_text, all_keywords_hrfp)
453+
found_keywords_hrfp_raw = find_keywords_in_text(article_text, all_keywords_hrfp)
454+
455+
# Filter out HRFP keywords that overlap with non-HRFP keywords
456+
# Non-HRFP always has priority over HRFP when they overlap
457+
found_keywords_hrfp_filtered = filter_hrfp_overlapping_with_non_hrfp(
458+
found_keywords_no_hrfp_with_positions,
459+
found_keywords_hrfp_raw
460+
)
390461

391462
# Distribute found keywords to their theme(s)
392-
for kw_dict in found_keywords_hrfp:
463+
for kw_dict in found_keywords_hrfp_filtered:
393464
keyword_str = kw_dict["keyword"]
394465
metadata_list = keyword_to_metadata_hrfp.get(keyword_str, [])
395466

0 commit comments

Comments
 (0)