Skip to content

Commit f69b4f4

Browse files
committed
Review regex detection to avoid matching shorter keywords inside longer expressions
1 parent 5687b7e commit f69b4f4

File tree

2 files changed

+186
-45
lines changed

2 files changed

+186
-45
lines changed

quotaclimat/data_ingestion/factiva/utils_data_processing/detect_keywords.py

Lines changed: 92 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ def create_combined_regex_pattern(
3030
"""
3131
Creates a large combined regex pattern from a list of keywords.
3232
Handles line start and is case-insensitive.
33+
34+
Keywords are sorted by length (descending) to ensure longer
35+
keywords are matched before shorter ones that might be substrings.
3336
3437
Args:
3538
keywords_filtered: List of keywords to transform into a regex
@@ -41,10 +44,14 @@ def create_combined_regex_pattern(
4144
if not keywords_filtered:
4245
return ""
4346

47+
# Sort keywords by length (descending) to match longest first
48+
# This prevents shorter keywords from matching when they're part of longer ones
49+
sorted_keywords = sorted(keywords_filtered, key=len, reverse=True)
50+
4451
# Transform each keyword (which may contain multiple words)
4552
transformed_keywords = []
4653

47-
for keyword in keywords_filtered:
54+
for keyword in sorted_keywords:
4855
# Split multi-word keywords and apply format_word_regex to each word
4956
words = keyword.split(" ")
5057
transformed_words = [format_word_regex(word) for word in words]
@@ -91,6 +98,8 @@ def search_keywords_in_text(
9198
) -> List[str]:
9299
"""
93100
Searches all keywords present in a text using a combined regex.
101+
When multiple keywords overlap (e.g., "effet de serre" inside "gaz à effet de serre"),
102+
only the LONGEST match is kept.
94103
95104
Args:
96105
text: The text to search in
@@ -110,14 +119,48 @@ def search_keywords_in_text(
110119
if not pattern:
111120
return []
112121

113-
# Find all occurrences (case-insensitivity already in the pattern)
114-
matches = re.findall(pattern, text)
122+
# Find all matches WITH positions (using finditer instead of findall)
123+
all_matches = []
124+
for match_obj in re.finditer(pattern, text):
125+
matched_text = match_obj.group(0)
126+
start_pos = match_obj.start()
127+
end_pos = match_obj.end()
128+
all_matches.append({
129+
'text': matched_text,
130+
'start': start_pos,
131+
'end': end_pos
132+
})
133+
134+
# Filter overlapping matches - keep only the longest ones
135+
# Sort by length (descending) to process longest matches first
136+
all_matches.sort(key=lambda x: x['end'] - x['start'], reverse=True)
137+
138+
# Keep track of used positions to detect overlaps
139+
accepted_matches = []
140+
used_ranges = []
141+
142+
for match in all_matches:
143+
start = match['start']
144+
end = match['end']
145+
146+
# Check if this match overlaps with any already accepted match
147+
is_overlapping = False
148+
for used_start, used_end in used_ranges:
149+
# Two ranges overlap if: start < used_end AND end > used_start
150+
if start < used_end and end > used_start:
151+
is_overlapping = True
152+
break
153+
154+
# If no overlap, accept this match
155+
if not is_overlapping:
156+
accepted_matches.append(match['text'])
157+
used_ranges.append((start, end))
115158

116159
# Return unique or all occurrences based on parameter
117160
if keep_duplicates:
118-
return matches
161+
return accepted_matches
119162
else:
120-
return list(set(matches))
163+
return list(set(accepted_matches))
121164

122165

123166
def is_keyword_in_text(keyword: str, text: str) -> bool:
@@ -223,6 +266,9 @@ def search_keywords_with_canonical_forms(
223266
Searches keywords in text and returns CANONICAL forms (from dictionary).
224267
225268
Fast implementation using pre-computed variant mapping instead of named groups.
269+
When multiple keywords overlap (e.g., "effet de serre" inside "gaz à effet de serre"),
270+
only the LONGEST match is kept.
271+
226272
Performance: O(m) for regex + O(n log n) sort and up to O(n^2) overlap filtering + O(n) for lookups, where n = number of matches (usually << 5900)
227273
228274
Args:
@@ -237,6 +283,9 @@ def search_keywords_with_canonical_forms(
237283
Example:
238284
>>> search_keywords_with_canonical_forms("Les canicules augmentent", ["canicule"])
239285
["canicule"] # Returns canonical form, not "canicules"
286+
287+
>>> search_keywords_with_canonical_forms("gaz à effet de serre", ["effet de serre", "gaz à effet de serre"])
288+
["gaz à effet de serre"] # Only longest match is kept
240289
"""
241290
if not keywords_filtered or not text:
242291
return []
@@ -250,12 +299,46 @@ def search_keywords_with_canonical_forms(
250299
if not pattern:
251300
return []
252301

253-
# Step 3: Find all matches (fast regex)
254-
matches = re.findall(pattern, text)
302+
# Step 3: Find all matches WITH positions
303+
all_matches = []
304+
for match_obj in re.finditer(pattern, text):
305+
matched_text = match_obj.group(0)
306+
start_pos = match_obj.start()
307+
end_pos = match_obj.end()
308+
all_matches.append({
309+
'text': matched_text,
310+
'start': start_pos,
311+
'end': end_pos
312+
})
313+
314+
# Step 4: Filter overlapping matches - keep only the longest ones
315+
# Sort by length (descending) to process longest matches first
316+
all_matches.sort(key=lambda x: x['end'] - x['start'], reverse=True)
317+
318+
# Keep track of used positions to detect overlaps
319+
accepted_matches = []
320+
used_ranges = []
321+
322+
for match in all_matches:
323+
start = match['start']
324+
end = match['end']
325+
326+
# Check if this match overlaps with any already accepted match
327+
is_overlapping = False
328+
for used_start, used_end in used_ranges:
329+
# Two ranges overlap if: start < used_end AND end > used_start
330+
if start < used_end and end > used_start:
331+
is_overlapping = True
332+
break
333+
334+
# If no overlap, accept this match
335+
if not is_overlapping:
336+
accepted_matches.append(match['text'])
337+
used_ranges.append((start, end))
255338

256-
# Step 4: Map each match to its canonical form (fast O(1) dict lookup)
339+
# Step 5: Map each accepted match to its canonical form (fast O(1) dict lookup)
257340
canonical_matches = []
258-
for match in matches:
341+
for match in accepted_matches:
259342
match_lower = match.lower()
260343
canonical_form = variant_to_canonical.get(match_lower)
261344
if canonical_form:

quotaclimat/data_processing/factiva/s3_to_postgre/extract_keywords_factiva.py

Lines changed: 94 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -246,8 +246,7 @@ def extract_keyword_data_from_article(article_text: str) -> Dict:
246246
"all_keywords": [],
247247
}
248248

249-
# Map theme names to keys (using base theme names without suffixes)
250-
# The get_keywords_by_theme_and_hrfp() function already combines themes with their _indirectes variants
249+
# Map theme names to keys
251250
theme_to_keys = {
252251
"changement_climatique_constat": {
253252
"count_no_hrfp": "number_of_changement_climatique_constat_no_hrfp",
@@ -320,48 +319,107 @@ def extract_keyword_data_from_article(article_text: str) -> Dict:
320319
# Storage for all_keywords construction
321320
all_keywords_data = []
322321

323-
# Find keywords for each theme (both HRFP and non-HRFP)
322+
# Mappings: keyword → list of {"theme", "category"} dicts (HRFP status is implied by which mapping holds the keyword)
323+
keyword_to_metadata_no_hrfp = {}
324+
keyword_to_metadata_hrfp = {}
325+
326+
# Collect all non-HRFP keywords
327+
all_keywords_no_hrfp = []
324328
for theme, keywords_dict in keywords_by_theme.items():
325-
keys = theme_to_keys.get(theme)
326-
if not keys:
327-
logging.debug(f"Skipping theme {theme} - no mapping found")
328-
continue
329-
330-
# Process non-HRFP keywords
331329
if keywords_dict.get("non_hrfp"):
332-
found_keywords = find_keywords_in_text(article_text, keywords_dict["non_hrfp"])
333-
result[keys["list_no_hrfp"]] = extract_keyword_strings(found_keywords)
334-
result[keys["count_no_hrfp"]] = count_unique_keywords(found_keywords)
335-
336-
# Add to all_keywords_data
337-
for kw_dict in found_keywords:
338-
all_keywords_data.append({
339-
"keyword": kw_dict["keyword"],
330+
for kw in keywords_dict["non_hrfp"]:
331+
keyword_str = kw["keyword"]
332+
333+
# A keyword can belong to multiple themes - store all mappings
334+
if keyword_str not in keyword_to_metadata_no_hrfp:
335+
keyword_to_metadata_no_hrfp[keyword_str] = []
336+
all_keywords_no_hrfp.append(kw)
337+
338+
keyword_to_metadata_no_hrfp[keyword_str].append({
340339
"theme": theme,
341-
"category": kw_dict["category"],
342-
"is_hrfp": False
340+
"category": kw["category"]
343341
})
344-
345-
if result[keys["count_no_hrfp"]] > 0:
346-
logging.debug(f"Found {result[keys['count_no_hrfp']]} unique non-HRFP keywords for theme {theme}")
347-
348-
# Process HRFP keywords
342+
343+
# Collect all HRFP keywords
344+
all_keywords_hrfp = []
345+
for theme, keywords_dict in keywords_by_theme.items():
349346
if keywords_dict.get("hrfp"):
350-
found_keywords = find_keywords_in_text(article_text, keywords_dict["hrfp"])
351-
result[keys["list_hrfp"]] = extract_keyword_strings(found_keywords)
352-
result[keys["count_hrfp"]] = count_unique_keywords(found_keywords)
353-
354-
# Add to all_keywords_data
355-
for kw_dict in found_keywords:
356-
all_keywords_data.append({
357-
"keyword": kw_dict["keyword"],
347+
for kw in keywords_dict["hrfp"]:
348+
keyword_str = kw["keyword"]
349+
350+
# A keyword can belong to multiple themes - store all mappings
351+
if keyword_str not in keyword_to_metadata_hrfp:
352+
keyword_to_metadata_hrfp[keyword_str] = []
353+
all_keywords_hrfp.append(kw) # Add to search list (once)
354+
355+
keyword_to_metadata_hrfp[keyword_str].append({
358356
"theme": theme,
359-
"category": kw_dict["category"],
360-
"is_hrfp": True
357+
"category": kw["category"]
361358
})
359+
360+
# ONE regex search for ALL non-HRFP keywords (with overlap filtering)
361+
if all_keywords_no_hrfp:
362+
found_keywords_no_hrfp = find_keywords_in_text(article_text, all_keywords_no_hrfp)
363+
364+
# Distribute found keywords to their theme(s)
365+
for kw_dict in found_keywords_no_hrfp:
366+
keyword_str = kw_dict["keyword"]
367+
metadata_list = keyword_to_metadata_no_hrfp.get(keyword_str, [])
362368

363-
if result[keys["count_hrfp"]] > 0:
364-
logging.debug(f"Found {result[keys['count_hrfp']]} unique HRFP keywords for theme {theme}")
369+
# Add this keyword to ALL themes it belongs to
370+
for metadata in metadata_list:
371+
theme = metadata["theme"]
372+
category = metadata["category"]
373+
374+
keys = theme_to_keys.get(theme)
375+
if keys:
376+
# Add to keyword list for this theme
377+
result[keys["list_no_hrfp"]].append(keyword_str)
378+
379+
# Add to all_keywords_data
380+
all_keywords_data.append({
381+
"keyword": keyword_str,
382+
"theme": theme,
383+
"category": category,
384+
"is_hrfp": False
385+
})
386+
387+
# ONE regex search for ALL HRFP keywords (with overlap filtering)
388+
if all_keywords_hrfp:
389+
found_keywords_hrfp = find_keywords_in_text(article_text, all_keywords_hrfp)
390+
391+
# Distribute found keywords to their theme(s)
392+
for kw_dict in found_keywords_hrfp:
393+
keyword_str = kw_dict["keyword"]
394+
metadata_list = keyword_to_metadata_hrfp.get(keyword_str, [])
395+
396+
# Add this keyword to ALL themes it belongs to
397+
for metadata in metadata_list:
398+
theme = metadata["theme"]
399+
category = metadata["category"]
400+
401+
keys = theme_to_keys.get(theme)
402+
if keys:
403+
# Add to keyword list for this theme
404+
result[keys["list_hrfp"]].append(keyword_str)
405+
406+
# Add to all_keywords_data
407+
all_keywords_data.append({
408+
"keyword": keyword_str,
409+
"theme": theme,
410+
"category": category,
411+
"is_hrfp": True
412+
})
413+
414+
# Calculate counts for each theme (unique keywords)
415+
for theme, keys in theme_to_keys.items():
416+
result[keys["count_no_hrfp"]] = len(set(result[keys["list_no_hrfp"]]))
417+
result[keys["count_hrfp"]] = len(set(result[keys["list_hrfp"]]))
418+
419+
if result[keys["count_no_hrfp"]] > 0:
420+
logging.debug(f"Found {result[keys['count_no_hrfp']]} unique non-HRFP keywords for theme {theme}")
421+
if result[keys["count_hrfp"]] > 0:
422+
logging.debug(f"Found {result[keys['count_hrfp']]} unique HRFP keywords for theme {theme}")
365423

366424
# Calculate combined climate solutions (attenuation + adaptation)
367425
# Non-HRFP

0 commit comments

Comments
 (0)