Review detection logic to avoid overlap between non-HRFP and HRFP keywords

vmateos1 · vmateos1 · commit d26d13f2776e · 2026-02-03T15:33:48.000+01:00
diff --git a/quotaclimat/data_ingestion/factiva/utils_data_processing/detect_keywords.py b/quotaclimat/data_ingestion/factiva/utils_data_processing/detect_keywords.py
@@ -1,6 +1,6 @@
 import re
 from itertools import product
-from typing import Dict, List, Tuple
+from typing import Dict, List
 
 
 def format_word_regex(word: str) -> str:
@@ -260,8 +260,8 @@ def create_variant_to_canonical_mapping(keywords_filtered: List[str]) -> Dict[st
 
 
 def search_keywords_with_canonical_forms(
-    text: str, keywords_filtered: List[str], keep_duplicates: bool = False
-) -> List[str]:
+    text: str, keywords_filtered: List[str], keep_duplicates: bool = False, return_positions: bool = False
+) -> List:
     """
     Searches keywords in text and returns CANONICAL forms (from dictionary).
     
@@ -276,16 +276,22 @@ def search_keywords_with_canonical_forms(
         keywords_filtered: List of canonical keywords from the dictionary
         keep_duplicates: If True, returns all occurrences (with duplicates).
                         If False, returns unique keywords only (default).
+        return_positions: If True, returns list of dicts with positions [{"keyword": "...", "start": X, "end": Y}, ...].
+                         If False, returns list of strings ["keyword1", "keyword2", ...] (default, backward compatible).
     
     Returns:
-        List of CANONICAL keywords found (matching the dictionary forms)
+        If return_positions=False: List of CANONICAL keyword strings (default, backward compatible)
+        If return_positions=True: List of dicts with structure [{"keyword": "...", "start": X, "end": Y}, ...]
     
     Example:
         >>> search_keywords_with_canonical_forms("Les canicules augmentent", ["canicule"])
         ["canicule"]  # Returns canonical form, not "canicules"
         
         >>> search_keywords_with_canonical_forms("gaz à effet de serre", ["effet de serre", "gaz à effet de serre"])
         ["gaz à effet de serre"]  # Only longest match is kept
+        
+        >>> search_keywords_with_canonical_forms("Les canicules augmentent", ["canicule"], return_positions=True)
+        [{"keyword": "canicule", "start": 4, "end": 13}]  # With positions
     """
     if not keywords_filtered or not text:
         return []
@@ -333,22 +339,46 @@ def search_keywords_with_canonical_forms(
         
         # If no overlap, accept this match
         if not is_overlapping:
-            accepted_matches.append(match['text'])
+            accepted_matches.append(match)
             used_ranges.append((start, end))
     
     # Step 5: Map each accepted match to its canonical form (fast O(1) dict lookup)
     canonical_matches = []
     for match in accepted_matches:
-        match_lower = match.lower()
+        match_lower = match['text'].lower()
         canonical_form = variant_to_canonical.get(match_lower)
         if canonical_form:
-            canonical_matches.append(canonical_form)
+            if return_positions:
+                canonical_matches.append({
+                    "keyword": canonical_form,
+                    "start": match['start'],
+                    "end": match['end']
+                })
+            else:
+                canonical_matches.append(canonical_form)
         else:
             # Fallback: keep original if no mapping found
-            canonical_matches.append(match)
+            if return_positions:
+                canonical_matches.append({
+                    "keyword": match['text'],
+                    "start": match['start'],
+                    "end": match['end']
+                })
+            else:
+                canonical_matches.append(match['text'])
     
     # Return unique or all occurrences based on parameter
     if keep_duplicates:
         return canonical_matches
     else:
-        return list(set(canonical_matches))
+        if return_positions:
+            # Remove duplicates based on keyword (keep first occurrence)
+            seen_keywords = set()
+            unique_matches = []
+            for match in canonical_matches:
+                if match["keyword"] not in seen_keywords:
+                    seen_keywords.add(match["keyword"])
+                    unique_matches.append(match)
+            return unique_matches
+        else:
+            return list(set(canonical_matches))
diff --git a/quotaclimat/data_processing/factiva/s3_to_postgre/extract_keywords_factiva.py b/quotaclimat/data_processing/factiva/s3_to_postgre/extract_keywords_factiva.py
@@ -5,7 +5,6 @@
 from typing import Dict, List
 
 from quotaclimat.data_ingestion.factiva.utils_data_processing.detect_keywords import (
-    search_keywords_in_text,
     search_keywords_with_canonical_forms,
 )
 from quotaclimat.data_processing.mediatree.keyword.keyword import THEME_KEYWORDS
@@ -91,38 +90,45 @@ def find_keywords_in_text(text: str, keywords_info: List[Dict]) -> List[Dict]:
     """
     Find ALL occurrences of keywords in text (including duplicates) with their metadata.
     
-    This function uses named capture groups to return CANONICAL forms from the dictionary,
+    This function returns CANONICAL forms from the dictionary,
     ensuring that categories are always preserved even for plural forms.
     
     Example:
         If text contains "canicules" and dictionary has {"keyword": "canicule", "category": "X"}:
-        Returns [{"keyword": "canicule", "category": "X"}]  # Preserved!
+        Returns [{"keyword": "canicule", "category": "X", "start": 0, "end": 9}]  # Preserved with positions!
     
     Args:
         text: The text to search in
         keywords_info: List of dicts with 'keyword' and 'category' keys
         
     Returns:
-        List of dicts with found keywords (canonical forms) and their categories (with duplicates)
+        List of dicts with found keywords (canonical forms), categories, and positions (with duplicates)
     """
     if not text or not keywords_info:
         return []
     
     # Extract just the keyword strings for search
     keyword_strings = [k["keyword"] for k in keywords_info]
     
-    # Find all occurrences using named groups (returns CANONICAL forms)
-    found_keywords = search_keywords_with_canonical_forms(text, keyword_strings, keep_duplicates=True)
+    # Find all occurrences WITH POSITIONS using search_keywords_with_canonical_forms
+    found_keywords_with_positions = search_keywords_with_canonical_forms(
+        text, 
+        keyword_strings, 
+        keep_duplicates=True,
+        return_positions=True
+    )
     
     # Create a mapping from keyword to category
     keyword_to_category = {k["keyword"]: k["category"] for k in keywords_info}
     
-    # Build result with categories
+    # Build result with categories and positions
     result = []
-    for keyword in found_keywords:
+    for item in found_keywords_with_positions:
         result.append({
-            "keyword": keyword,
-            "category": keyword_to_category.get(keyword, "")
+            "keyword": item["keyword"],
+            "category": keyword_to_category.get(item["keyword"], ""),
+            "start": item["start"],
+            "end": item["end"]
         })
     
     return result
@@ -154,6 +160,62 @@ def extract_keyword_strings(keyword_list: List[Dict]) -> List[str]:
     return [k["keyword"] for k in keyword_list]
 
 
+def filter_hrfp_overlapping_with_non_hrfp(
+    keywords_with_positions_non_hrfp: List[Dict],
+    keywords_with_positions_hrfp: List[Dict]
+) -> List[Dict]:
+    """
+    Drop every HRFP match whose span intersects a non-HRFP match.
+
+    Non-HRFP matches take priority: an HRFP match is discarded as soon as its
+    (start, end) range strictly overlaps the range of any non-HRFP match.
+    Ranges that merely touch (one ends exactly where the other starts) are
+    not treated as overlapping, so such HRFP matches are kept.
+
+    Example:
+        text = "le réchauffement climatique est un réchauffement"
+        non_hrfp = [{"keyword": "réchauffement climatique", "start": 3, "end": 27, ...}]
+        hrfp = [{"keyword": "réchauffement", "start": 3, "end": 16, ...},   # overlaps -> dropped
+                {"keyword": "réchauffement", "start": 35, "end": 48, ...}]  # no overlap -> kept
+
+        Only the second "réchauffement" is returned.
+
+    Args:
+        keywords_with_positions_non_hrfp: Non-HRFP matches; each dict must provide
+            "start" and "end".
+        keywords_with_positions_hrfp: HRFP matches; each dict must provide
+            "start" and "end".
+
+    Returns:
+        The HRFP matches that overlap no non-HRFP match, in their input order.
+        If there are no non-HRFP matches, the HRFP input list itself is returned.
+    """
+    # Nothing to filter.
+    if not keywords_with_positions_hrfp:
+        return []
+
+    # Nothing to filter against: hand the input back untouched.
+    if not keywords_with_positions_non_hrfp:
+        return keywords_with_positions_hrfp
+
+    # Spans already claimed by non-HRFP matches.
+    blocked_spans = [
+        (kw["start"], kw["end"]) for kw in keywords_with_positions_non_hrfp
+    ]
+
+    def _collides(candidate: Dict) -> bool:
+        """Return True if candidate's span overlaps any blocked span."""
+        lo, hi = candidate["start"], candidate["end"]
+        return any(
+            lo < span_end and hi > span_start
+            for span_start, span_end in blocked_spans
+        )
+
+    return [
+        kw for kw in keywords_with_positions_hrfp if not _collides(kw)
+    ]
+
+
 def extract_keyword_data_from_article(article_text: str) -> Dict:
     """
     Extract keyword counts AND lists for all themes from a Factiva article.
@@ -357,12 +419,13 @@ def extract_keyword_data_from_article(article_text: str) -> Dict:
                     "category": kw["category"]
                 })
     
-    # ONE regex search for ALL non-HRFP keywords (with overlap filtering)
+    # ONE regex search for ALL non-HRFP keywords (with overlap filtering + positions)
+    found_keywords_no_hrfp_with_positions = []
     if all_keywords_no_hrfp:
-        found_keywords_no_hrfp = find_keywords_in_text(article_text, all_keywords_no_hrfp)
+        found_keywords_no_hrfp_with_positions = find_keywords_in_text(article_text, all_keywords_no_hrfp)
         
         # Distribute found keywords to their theme(s)
-        for kw_dict in found_keywords_no_hrfp:
+        for kw_dict in found_keywords_no_hrfp_with_positions:
             keyword_str = kw_dict["keyword"]
             metadata_list = keyword_to_metadata_no_hrfp.get(keyword_str, [])
             
@@ -384,12 +447,20 @@ def extract_keyword_data_from_article(article_text: str) -> Dict:
                         "is_hrfp": False
                     })
     
-    # ONE regex search for ALL HRFP keywords (with overlap filtering)
+    # ONE regex search for ALL HRFP keywords (with overlap filtering + positions)
+    # THEN filter out HRFP keywords that overlap with non-HRFP keywords
     if all_keywords_hrfp:
-        found_keywords_hrfp = find_keywords_in_text(article_text, all_keywords_hrfp)
+        found_keywords_hrfp_raw = find_keywords_in_text(article_text, all_keywords_hrfp)
+        
+        # Filter out HRFP keywords that overlap with non-HRFP keywords
+        # Non-HRFP always has priority over HRFP when they overlap
+        found_keywords_hrfp_filtered = filter_hrfp_overlapping_with_non_hrfp(
+            found_keywords_no_hrfp_with_positions,
+            found_keywords_hrfp_raw
+        )
         
         # Distribute found keywords to their theme(s)
-        for kw_dict in found_keywords_hrfp:
+        for kw_dict in found_keywords_hrfp_filtered:
             keyword_str = kw_dict["keyword"]
             metadata_list = keyword_to_metadata_hrfp.get(keyword_str, [])