88from src .content_extractor .s3_sequential import S3QuoteExtractor
99from src .content_extractor .base import BaseExtractorConfig
1010from src .content_extractor .highlighter import highlight_occurrence
11+ from src .models .graph_models import (
12+ GraphInput , GraphOutput , Node , NodeData , Edge , EdgeData , Occurrence , Entity
13+ )
1114
1215logger = logging .getLogger (__name__ )
1316
@@ -17,25 +20,19 @@ def slugify(text: str) -> str:
1720 text = re .sub (r'[^a-z0-9]+' , '_' , text )
1821 return text .strip ('_' )
1922
20- def build_registries (entities : List [Dict [ str , Any ] ]) -> Dict [str , Any ]:
21- """Parses entities to map s3_uris to keywords and metadata."""
23+ def build_registries (entities : List [Entity ]) -> Dict [str , Any ]:
24+ """Parses entities to map s3_uris to keywords and metadata based on structured aliases ."""
2225 registry = defaultdict (lambda : {"keywords" : set (), "entities" : []})
2326
2427 for ent in entities :
25- props = ent .get ("properties" , {})
26- source_urls_raw = props .get ("sourceUrls" , [])
27-
28- if isinstance (source_urls_raw , str ):
29- s3_uris = [u .strip () for u in source_urls_raw .split (',' )]
30- else :
31- s3_uris = source_urls_raw
32-
33- aliases = ent .get ("aliases" , [])
34-
35- for uri in s3_uris :
36- if not uri : continue
37- registry [uri ]["keywords" ].update (aliases )
38- registry [uri ]["entities" ].append (ent )
28+ for alias in ent .aliases :
29+ for uri in alias .source_files :
30+ if not uri or not uri .startswith ("s3://" ):
31+ continue
32+ registry [uri ]["keywords" ].add (alias .name )
33+ # Ensure each entity is only added once per unique URI
34+ if ent not in registry [uri ]["entities" ]:
35+ registry [uri ]["entities" ].append (ent )
3936
4037 return registry
4138
@@ -65,63 +62,64 @@ def map_findings_to_entities(raw_findings: List[Dict[str, Any]], registry: Dict[
6562 uri = finding ["source" ]
6663 keyword = finding ["keyword_matched" ]
6764 content = finding ["content" ]
68- link = finding ["link" ] # Use the pre-calculated link from extractor
65+ link = finding ["link" ]
6966
7067 for ent in registry [uri ]["entities" ]:
71- if keyword in ent .get ( " aliases" , [] ):
72- occurrence = {
73- " link" : link ,
74- " context" : highlight_occurrence (content , keyword )
75- }
76- results [ent [ " canonical_key" ] ][keyword ].append (occurrence )
68+ if any ( a . name == keyword for a in ent .aliases ):
69+ occurrence = Occurrence (
70+ link = link ,
71+ context = highlight_occurrence (content , keyword )
72+ )
73+ results [ent . canonical_key ][keyword ].append (occurrence )
7774
7875 return results
7976
80- def build_node_structure (entities : List [Dict [ str , Any ]] , entity_results : Dict [str , Any ]) -> Dict [ str , Any ] :
77+ def build_node_structure (entities : List [Entity ] , entity_results : Dict [str , Any ]) -> GraphOutput :
8178 """Constructs the final list of nodes and edges."""
8279 nodes , edges = [], []
8380
8481 for ent in entities :
85- ent_id = ent [ " canonical_key" ]
86- human_label = ent .get ( " label" ) or ent_id .replace ("_" , " " ).title ()
87- nodes .append ({ " data" : { "id" : ent_id , " label" : human_label , " type" : " entity"}} )
82+ ent_id = ent . canonical_key
83+ human_label = ent .label or ent_id .replace ("_" , " " ).title ()
84+ nodes .append (Node ( data = NodeData ( id = ent_id , label = human_label , type = " entity")) )
8885
8986 # Use a dict to accumulate alias nodes by their slugified ID to avoid duplicates
9087 alias_map = {}
9188
92- for alias in ent .get ("aliases" , []):
89+ for alias_obj in ent .aliases :
90+ alias = alias_obj .name
9391 occurrences = entity_results [ent_id ].get (alias , [])
9492 alias_id = f"{ ent_id } __{ slugify (alias )} "
9593
9694 if alias_id not in alias_map :
97- alias_map [alias_id ] = {
98- "id" : alias_id ,
99- " label" : alias ,
100- " type" : "alias" ,
101- " occurrences" : []
102- }
95+ alias_map [alias_id ] = NodeData (
96+ id = alias_id ,
97+ label = alias ,
98+ type = "alias" ,
99+ occurrences = []
100+ )
103101
104102 if occurrences :
105- alias_map [alias_id ][ " occurrences" ] .extend (occurrences )
103+ alias_map [alias_id ]. occurrences .extend (occurrences )
106104
107105 # Add the deduplicated alias nodes and their edges
108- for alias_id , alias_data in alias_map .items ():
109- # If no occurrences, remove the empty list from the data
110- if not alias_data [ " occurrences" ] :
111- del alias_data [ " occurrences" ]
106+ for alias_id , node_data in alias_map .items ():
107+ # If no occurrences, clear the list (Pydantic will handle Optional)
108+ if not node_data . occurrences :
109+ node_data . occurrences = None
112110
113- nodes .append ({ " data" : alias_data } )
111+ nodes .append (Node ( data = node_data ) )
114112
115- count = len (alias_data . get ( " occurrences" , []))
116- edges .append ({
117- " data" : {
118- " source" : ent_id ,
119- " target" : alias_id ,
120- " label" : f"Alias ({ count } )" if count > 0 else "Alias"
121- }
122- } )
113+ count = len (node_data . occurrences ) if node_data . occurrences else 0
114+ edges .append (Edge (
115+ data = EdgeData (
116+ source = ent_id ,
117+ target = alias_id ,
118+ label = f"Alias ({ count } )" if count > 0 else "Alias"
119+ )
120+ ) )
123121
124- return { " nodes" : nodes , " edges" : edges }
122+ return GraphOutput ( nodes = nodes , edges = edges )
125123
126124async def generate_graph (input_data : Union [str , Dict [str , Any ]], output_path : Optional [str ] = None ):
127125 """Main orchestration function. Can take a file path (str) or a dictionary."""
@@ -134,13 +132,21 @@ async def generate_graph(input_data: Union[str, Dict[str, Any]], output_path: Op
134132 else :
135133 graph_data = input_data
136134
137- entities = graph_data .get ("entities" , [])
135+ # Validate input
136+ try :
137+ validated_input = GraphInput .model_validate (graph_data )
138+ entities = validated_input .entities
139+ except Exception as e :
140+ logger .error (f"Input validation failed: { e } " )
141+ raise
142+
138143 registry = build_registries (entities )
139144
140145 raw_findings = await fetch_extraction_findings (registry )
141146 entity_results = map_findings_to_entities (raw_findings , registry )
142147
143- cy_json = build_node_structure (entities , entity_results )
148+ cy_graph = build_node_structure (entities , entity_results )
149+ cy_json = cy_graph .model_dump (exclude_none = True )
144150
145151 if output_path :
146152 os .makedirs (os .path .dirname (output_path ), exist_ok = True )
0 commit comments