review regex detection to avoid detection of shorter keywords within longer expressions

vmateos1 · vmateos1 · commit f69b4f4edbe8 · 2026-01-23T16:21:48.000+01:00
diff --git a/quotaclimat/data_ingestion/factiva/utils_data_processing/detect_keywords.py b/quotaclimat/data_ingestion/factiva/utils_data_processing/detect_keywords.py
@@ -30,6 +30,9 @@ def create_combined_regex_pattern(
     """
     Creates a large combined regex pattern from a list of keywords.
     Handles line start and is case-insensitive.
+    
+    Keywords are sorted by length (descending) to ensure longer 
+    keywords are matched before shorter ones that might be substrings.
 
     Args:
         keywords_filtered: List of keywords to transform into a regex
@@ -41,10 +44,14 @@ def create_combined_regex_pattern(
     if not keywords_filtered:
         return ""
 
+    # Sort keywords by length (descending) to match longest first
+    # This prevents shorter keywords from matching when they're part of longer ones
+    sorted_keywords = sorted(keywords_filtered, key=len, reverse=True)
+
     # Transform each keyword (which may contain multiple words)
     transformed_keywords = []
 
-    for keyword in keywords_filtered:
+    for keyword in sorted_keywords:
         # Split multi-word keywords and apply format_word_regex to each word
         words = keyword.split(" ")
         transformed_words = [format_word_regex(word) for word in words]
@@ -91,6 +98,8 @@ def search_keywords_in_text(
 ) -> List[str]:
     """
     Searches all keywords present in a text using a combined regex.
+    When multiple keywords overlap (e.g., "effet de serre" inside "gaz à effet de serre"),
+    only the LONGEST match is kept.
 
     Args:
         text: The text to search in
@@ -110,14 +119,48 @@ def search_keywords_in_text(
     if not pattern:
         return []
 
-    # Find all occurrences (case-insensitivity already in the pattern)
-    matches = re.findall(pattern, text)
+    # Find all matches WITH positions (using finditer instead of findall)
+    all_matches = []
+    for match_obj in re.finditer(pattern, text):
+        matched_text = match_obj.group(0)
+        start_pos = match_obj.start()
+        end_pos = match_obj.end()
+        all_matches.append({
+            'text': matched_text,
+            'start': start_pos,
+            'end': end_pos
+        })
+    
+    # Filter overlapping matches - keep only the longest ones
+    # Sort by length (descending) to process longest matches first
+    all_matches.sort(key=lambda x: x['end'] - x['start'], reverse=True)
+    
+    # Keep track of used positions to detect overlaps
+    accepted_matches = []
+    used_ranges = []
+    
+    for match in all_matches:
+        start = match['start']
+        end = match['end']
+        
+        # Check if this match overlaps with any already accepted match
+        is_overlapping = False
+        for used_start, used_end in used_ranges:
+            # Two ranges overlap if: start < used_end AND end > used_start
+            if start < used_end and end > used_start:
+                is_overlapping = True
+                break
+        
+        # If no overlap, accept this match
+        if not is_overlapping:
+            accepted_matches.append(match['text'])
+            used_ranges.append((start, end))
 
     # Return unique or all occurrences based on parameter
     if keep_duplicates:
-        return matches
+        return accepted_matches
     else:
-        return list(set(matches))
+        return list(set(accepted_matches))
 
 
 def is_keyword_in_text(keyword: str, text: str) -> bool:
@@ -223,6 +266,9 @@ def search_keywords_with_canonical_forms(
     Searches keywords in text and returns CANONICAL forms (from dictionary).
     
     Fast implementation using pre-computed variant mapping instead of named groups.
+    When multiple keywords overlap (e.g., "effet de serre" inside "gaz à effet de serre"),
+    only the LONGEST match is kept.
+    
     Performance: O(m) for regex + O(n) for lookups where n = number of matches (usually << 5900)
     
     Args:
@@ -237,6 +283,9 @@ def search_keywords_with_canonical_forms(
     Example:
         >>> search_keywords_with_canonical_forms("Les canicules augmentent", ["canicule"])
         ["canicule"]  # Returns canonical form, not "canicules"
+        
+        >>> search_keywords_with_canonical_forms("gaz à effet de serre", ["effet de serre", "gaz à effet de serre"])
+        ["gaz à effet de serre"]  # Only longest match is kept
     """
     if not keywords_filtered or not text:
         return []
@@ -250,12 +299,46 @@ def search_keywords_with_canonical_forms(
     if not pattern:
         return []
     
-    # Step 3: Find all matches (fast regex)
-    matches = re.findall(pattern, text)
+    # Step 3: Find all matches WITH positions
+    all_matches = []
+    for match_obj in re.finditer(pattern, text):
+        matched_text = match_obj.group(0)
+        start_pos = match_obj.start()
+        end_pos = match_obj.end()
+        all_matches.append({
+            'text': matched_text,
+            'start': start_pos,
+            'end': end_pos
+        })
+    
+    # Step 4: Filter overlapping matches - keep only the longest ones
+    # Sort by length (descending) to process longest matches first
+    all_matches.sort(key=lambda x: x['end'] - x['start'], reverse=True)
+    
+    # Keep track of used positions to detect overlaps
+    accepted_matches = []
+    used_ranges = []
+    
+    for match in all_matches:
+        start = match['start']
+        end = match['end']
+        
+        # Check if this match overlaps with any already accepted match
+        is_overlapping = False
+        for used_start, used_end in used_ranges:
+            # Two ranges overlap if: start < used_end AND end > used_start
+            if start < used_end and end > used_start:
+                is_overlapping = True
+                break
+        
+        # If no overlap, accept this match
+        if not is_overlapping:
+            accepted_matches.append(match['text'])
+            used_ranges.append((start, end))
     
-    # Step 4: Map each match to its canonical form (fast O(1) dict lookup)
+    # Step 5: Map each accepted match to its canonical form (fast O(1) dict lookup)
     canonical_matches = []
-    for match in matches:
+    for match in accepted_matches:
         match_lower = match.lower()
         canonical_form = variant_to_canonical.get(match_lower)
         if canonical_form:
diff --git a/quotaclimat/data_processing/factiva/s3_to_postgre/extract_keywords_factiva.py b/quotaclimat/data_processing/factiva/s3_to_postgre/extract_keywords_factiva.py
@@ -246,8 +246,7 @@ def extract_keyword_data_from_article(article_text: str) -> Dict:
         "all_keywords": [],
     }
     
-    # Map theme names to keys (using base theme names without suffixes)
-    # The get_keywords_by_theme_and_hrfp() function already combines themes with their _indirectes variants
+    # Map theme names to keys
     theme_to_keys = {
         "changement_climatique_constat": {
             "count_no_hrfp": "number_of_changement_climatique_constat_no_hrfp",
@@ -320,48 +319,107 @@ def extract_keyword_data_from_article(article_text: str) -> Dict:
     # Storage for all_keywords construction
     all_keywords_data = []
     
-    # Find keywords for each theme (both HRFP and non-HRFP)
+    # Mappings: keyword → list of {theme, category} dicts (one mapping dict per HRFP status)
+    keyword_to_metadata_no_hrfp = {}
+    keyword_to_metadata_hrfp = {}
+    
+    # Collect all non-HRFP keywords
+    all_keywords_no_hrfp = []
     for theme, keywords_dict in keywords_by_theme.items():
-        keys = theme_to_keys.get(theme)
-        if not keys:
-            logging.debug(f"Skipping theme {theme} - no mapping found")
-            continue
-        
-        # Process non-HRFP keywords
         if keywords_dict.get("non_hrfp"):
-            found_keywords = find_keywords_in_text(article_text, keywords_dict["non_hrfp"])
-            result[keys["list_no_hrfp"]] = extract_keyword_strings(found_keywords)
-            result[keys["count_no_hrfp"]] = count_unique_keywords(found_keywords)
-            
-            # Add to all_keywords_data
-            for kw_dict in found_keywords:
-                all_keywords_data.append({
-                    "keyword": kw_dict["keyword"],
+            for kw in keywords_dict["non_hrfp"]:
+                keyword_str = kw["keyword"]
+                
+                # A keyword can belong to multiple themes - store all mappings
+                if keyword_str not in keyword_to_metadata_no_hrfp:
+                    keyword_to_metadata_no_hrfp[keyword_str] = []
+                    all_keywords_no_hrfp.append(kw)
+                
+                keyword_to_metadata_no_hrfp[keyword_str].append({
                     "theme": theme,
-                    "category": kw_dict["category"],
-                    "is_hrfp": False
+                    "category": kw["category"]
                 })
-            
-            if result[keys["count_no_hrfp"]] > 0:
-                logging.debug(f"Found {result[keys['count_no_hrfp']]} unique non-HRFP keywords for theme {theme}")
-        
-        # Process HRFP keywords
+    
+    # Collect all HRFP keywords
+    all_keywords_hrfp = []
+    for theme, keywords_dict in keywords_by_theme.items():
         if keywords_dict.get("hrfp"):
-            found_keywords = find_keywords_in_text(article_text, keywords_dict["hrfp"])
-            result[keys["list_hrfp"]] = extract_keyword_strings(found_keywords)
-            result[keys["count_hrfp"]] = count_unique_keywords(found_keywords)
-            
-            # Add to all_keywords_data
-            for kw_dict in found_keywords:
-                all_keywords_data.append({
-                    "keyword": kw_dict["keyword"],
+            for kw in keywords_dict["hrfp"]:
+                keyword_str = kw["keyword"]
+                
+                # A keyword can belong to multiple themes - store all mappings
+                if keyword_str not in keyword_to_metadata_hrfp:
+                    keyword_to_metadata_hrfp[keyword_str] = []
+                    all_keywords_hrfp.append(kw)  # Add to search list (once)
+                
+                keyword_to_metadata_hrfp[keyword_str].append({
                     "theme": theme,
-                    "category": kw_dict["category"],
-                    "is_hrfp": True
+                    "category": kw["category"]
                 })
+    
+    # ONE regex search for ALL non-HRFP keywords (with overlap filtering)
+    if all_keywords_no_hrfp:
+        found_keywords_no_hrfp = find_keywords_in_text(article_text, all_keywords_no_hrfp)
+        
+        # Distribute found keywords to their theme(s)
+        for kw_dict in found_keywords_no_hrfp:
+            keyword_str = kw_dict["keyword"]
+            metadata_list = keyword_to_metadata_no_hrfp.get(keyword_str, [])
             
-            if result[keys["count_hrfp"]] > 0:
-                logging.debug(f"Found {result[keys['count_hrfp']]} unique HRFP keywords for theme {theme}")
+            # Add this keyword to ALL themes it belongs to
+            for metadata in metadata_list:
+                theme = metadata["theme"]
+                category = metadata["category"]
+                
+                keys = theme_to_keys.get(theme)
+                if keys:
+                    # Add to keyword list for this theme
+                    result[keys["list_no_hrfp"]].append(keyword_str)
+                    
+                    # Add to all_keywords_data
+                    all_keywords_data.append({
+                        "keyword": keyword_str,
+                        "theme": theme,
+                        "category": category,
+                        "is_hrfp": False
+                    })
+    
+    # ONE regex search for ALL HRFP keywords (with overlap filtering)
+    if all_keywords_hrfp:
+        found_keywords_hrfp = find_keywords_in_text(article_text, all_keywords_hrfp)
+        
+        # Distribute found keywords to their theme(s)
+        for kw_dict in found_keywords_hrfp:
+            keyword_str = kw_dict["keyword"]
+            metadata_list = keyword_to_metadata_hrfp.get(keyword_str, [])
+            
+            # Add this keyword to ALL themes it belongs to
+            for metadata in metadata_list:
+                theme = metadata["theme"]
+                category = metadata["category"]
+                
+                keys = theme_to_keys.get(theme)
+                if keys:
+                    # Add to keyword list for this theme
+                    result[keys["list_hrfp"]].append(keyword_str)
+                    
+                    # Add to all_keywords_data
+                    all_keywords_data.append({
+                        "keyword": keyword_str,
+                        "theme": theme,
+                        "category": category,
+                        "is_hrfp": True
+                    })
+    
+    # Calculate counts for each theme (unique keywords)
+    for theme, keys in theme_to_keys.items():
+        result[keys["count_no_hrfp"]] = len(set(result[keys["list_no_hrfp"]]))
+        result[keys["count_hrfp"]] = len(set(result[keys["list_hrfp"]]))
+        
+        if result[keys["count_no_hrfp"]] > 0:
+            logging.debug(f"Found {result[keys['count_no_hrfp']]} unique non-HRFP keywords for theme {theme}")
+        if result[keys["count_hrfp"]] > 0:
+            logging.debug(f"Found {result[keys['count_hrfp']]} unique HRFP keywords for theme {theme}")
     
     # Calculate combined climate solutions (attenuation + adaptation)
     # Non-HRFP