@@ -246,8 +246,7 @@ def extract_keyword_data_from_article(article_text: str) -> Dict:
246246 "all_keywords" : [],
247247 }
248248
249- # Map theme names to keys (using base theme names without suffixes)
250- # The get_keywords_by_theme_and_hrfp() function already combines themes with their _indirectes variants
249+ # Map theme names to keys
251250 theme_to_keys = {
252251 "changement_climatique_constat" : {
253252 "count_no_hrfp" : "number_of_changement_climatique_constat_no_hrfp" ,
@@ -320,48 +319,107 @@ def extract_keyword_data_from_article(article_text: str) -> Dict:
320319 # Storage for all_keywords construction
321320 all_keywords_data = []
322321
323- # Find keywords for each theme (both HRFP and non-HRFP)
322+ # Mappings: keyword → list of {"theme", "category"} dicts (HRFP and non-HRFP keywords are kept in separate maps)
323+ keyword_to_metadata_no_hrfp = {}
324+ keyword_to_metadata_hrfp = {}
325+
326+ # Collect all non-HRFP keywords
327+ all_keywords_no_hrfp = []
324328 for theme , keywords_dict in keywords_by_theme .items ():
325- keys = theme_to_keys .get (theme )
326- if not keys :
327- logging .debug (f"Skipping theme { theme } - no mapping found" )
328- continue
329-
330- # Process non-HRFP keywords
331329 if keywords_dict .get ("non_hrfp" ):
332- found_keywords = find_keywords_in_text (article_text , keywords_dict ["non_hrfp" ])
333- result [keys ["list_no_hrfp" ]] = extract_keyword_strings (found_keywords )
334- result [keys ["count_no_hrfp" ]] = count_unique_keywords (found_keywords )
335-
336- # Add to all_keywords_data
337- for kw_dict in found_keywords :
338- all_keywords_data .append ({
339- "keyword" : kw_dict ["keyword" ],
330+ for kw in keywords_dict ["non_hrfp" ]:
331+ keyword_str = kw ["keyword" ]
332+
333+ # A keyword can belong to multiple themes - store all mappings
334+ if keyword_str not in keyword_to_metadata_no_hrfp :
335+ keyword_to_metadata_no_hrfp [keyword_str ] = []
336+ all_keywords_no_hrfp .append (kw )
337+
338+ keyword_to_metadata_no_hrfp [keyword_str ].append ({
340339 "theme" : theme ,
341- "category" : kw_dict ["category" ],
342- "is_hrfp" : False
340+ "category" : kw ["category" ]
343341 })
344-
345- if result [keys ["count_no_hrfp" ]] > 0 :
346- logging .debug (f"Found { result [keys ['count_no_hrfp' ]]} unique non-HRFP keywords for theme { theme } " )
347-
348- # Process HRFP keywords
342+
343+ # Collect all HRFP keywords
344+ all_keywords_hrfp = []
345+ for theme , keywords_dict in keywords_by_theme .items ():
349346 if keywords_dict .get ("hrfp" ):
350- found_keywords = find_keywords_in_text (article_text , keywords_dict ["hrfp" ])
351- result [keys ["list_hrfp" ]] = extract_keyword_strings (found_keywords )
352- result [keys ["count_hrfp" ]] = count_unique_keywords (found_keywords )
353-
354- # Add to all_keywords_data
355- for kw_dict in found_keywords :
356- all_keywords_data .append ({
357- "keyword" : kw_dict ["keyword" ],
347+ for kw in keywords_dict ["hrfp" ]:
348+ keyword_str = kw ["keyword" ]
349+
350+ # A keyword can belong to multiple themes - store all mappings
351+ if keyword_str not in keyword_to_metadata_hrfp :
352+ keyword_to_metadata_hrfp [keyword_str ] = []
353+ all_keywords_hrfp .append (kw ) # Add to search list (once)
354+
355+ keyword_to_metadata_hrfp [keyword_str ].append ({
358356 "theme" : theme ,
359- "category" : kw_dict ["category" ],
360- "is_hrfp" : True
357+ "category" : kw ["category" ]
361358 })
359+
360+ # ONE regex search for ALL non-HRFP keywords (with overlap filtering)
361+ if all_keywords_no_hrfp :
362+ found_keywords_no_hrfp = find_keywords_in_text (article_text , all_keywords_no_hrfp )
363+
364+ # Distribute found keywords to their theme(s)
365+ for kw_dict in found_keywords_no_hrfp :
366+ keyword_str = kw_dict ["keyword" ]
367+ metadata_list = keyword_to_metadata_no_hrfp .get (keyword_str , [])
362368
363- if result [keys ["count_hrfp" ]] > 0 :
364- logging .debug (f"Found { result [keys ['count_hrfp' ]]} unique HRFP keywords for theme { theme } " )
369+ # Add this keyword to ALL themes it belongs to
370+ for metadata in metadata_list :
371+ theme = metadata ["theme" ]
372+ category = metadata ["category" ]
373+
374+ keys = theme_to_keys .get (theme )
375+ if keys :
376+ # Add to keyword list for this theme
377+ result [keys ["list_no_hrfp" ]].append (keyword_str )
378+
379+ # Add to all_keywords_data
380+ all_keywords_data .append ({
381+ "keyword" : keyword_str ,
382+ "theme" : theme ,
383+ "category" : category ,
384+ "is_hrfp" : False
385+ })
386+
387+ # ONE regex search for ALL HRFP keywords (with overlap filtering)
388+ if all_keywords_hrfp :
389+ found_keywords_hrfp = find_keywords_in_text (article_text , all_keywords_hrfp )
390+
391+ # Distribute found keywords to their theme(s)
392+ for kw_dict in found_keywords_hrfp :
393+ keyword_str = kw_dict ["keyword" ]
394+ metadata_list = keyword_to_metadata_hrfp .get (keyword_str , [])
395+
396+ # Add this keyword to ALL themes it belongs to
397+ for metadata in metadata_list :
398+ theme = metadata ["theme" ]
399+ category = metadata ["category" ]
400+
401+ keys = theme_to_keys .get (theme )
402+ if keys :
403+ # Add to keyword list for this theme
404+ result [keys ["list_hrfp" ]].append (keyword_str )
405+
406+ # Add to all_keywords_data
407+ all_keywords_data .append ({
408+ "keyword" : keyword_str ,
409+ "theme" : theme ,
410+ "category" : category ,
411+ "is_hrfp" : True
412+ })
413+
414+ # Calculate counts for each theme (unique keywords)
415+ for theme , keys in theme_to_keys .items ():
416+ result [keys ["count_no_hrfp" ]] = len (set (result [keys ["list_no_hrfp" ]]))
417+ result [keys ["count_hrfp" ]] = len (set (result [keys ["list_hrfp" ]]))
418+
419+ if result [keys ["count_no_hrfp" ]] > 0 :
420+ logging .debug (f"Found { result [keys ['count_no_hrfp' ]]} unique non-HRFP keywords for theme { theme } " )
421+ if result [keys ["count_hrfp" ]] > 0 :
422+ logging .debug (f"Found { result [keys ['count_hrfp' ]]} unique HRFP keywords for theme { theme } " )
365423
366424 # Calculate combined climate solutions (attenuation + adaptation)
367425 # Non-HRFP
0 commit comments