Read graph input from S3 and write graph output to S3

Ademola Adefioye · Ademola Adefioye · commit 01e821063677 · 2026-04-22T12:25:06.000+01:00
This change adds the ability to read the input graph from S3 and to
write the generated output directly to S3.
diff --git a/app.py b/app.py
@@ -2,7 +2,7 @@
 import logging
 from flask import Flask, request, jsonify, render_template
 from dotenv import load_dotenv
-from src.generate_graph import generate_graph, load_graph_viewmodel
+from src.generate_graph import generate_graph, load_graph_viewmodel, generate_output_path, summarize_path
 
 load_dotenv()
 
@@ -47,10 +47,14 @@ async def extract_quotes():
         Endpoint that runs the Cytoscape graph generation logic based on graph.json.
         """
         try:
-            logger.info('Starting graph generation process...')
-            graph_data = await generate_graph("graph.json")
+            input_path = request.args.get('input_path')
+            if not input_path:
+                return jsonify({"error": "Missing 'input_path' query parameter"}), 400
+                
+            output_path = generate_output_path(input_path)
+            logger.info(f'Starting graph generation process for {summarize_path(input_path)}...')
+            graph_data = await generate_graph(input_path, output_path)
             logger.info('Graph generation completed successfully.')
-            
             return jsonify(graph_data), 200
 
         except Exception as e:
diff --git a/src/generate_graph.py b/src/generate_graph.py
@@ -8,10 +8,12 @@
 from src.content_extractor.s3_sequential import S3QuoteExtractor
 from src.content_extractor.base import BaseExtractorConfig
 from src.content_extractor.highlighter import highlight_occurrence
+import fsspec
 from src.models.graph_models import (
     GraphInput, GraphOutput, Node, NodeData, Edge, EdgeData, Occurrence, Entity
 )
 
+
 logger = logging.getLogger(__name__)
 
 def slugify(text: str) -> str:
@@ -124,10 +126,11 @@ def build_node_structure(entities: List[Entity], entity_results: Dict[str, Any])
 async def generate_graph(input_data: Union[str, Dict[str, Any]], output_path: Optional[str] = None):
     """Main orchestration function. Can take a file path (str) or a dictionary."""
     if isinstance(input_data, str):
-        if not os.path.exists(input_data):
+        of = fsspec.open(input_data, "r")
+        if not of.fs.exists(of.path):
             logger.error(f"Input file {input_data} not found.")
             return
-        with open(input_data, "r") as f:
+        with of as f:
             graph_data = json.load(f)
     else:
         graph_data = input_data
@@ -149,24 +152,53 @@ async def generate_graph(input_data: Union[str, Dict[str, Any]], output_path: Op
     cy_json = cy_graph.model_dump(exclude_none=True)
     
     if output_path:
-        os.makedirs(os.path.dirname(output_path), exist_ok=True)
-        with open(output_path, "w") as f:
+        with fsspec.open(output_path, "w", auto_mkdir=True) as f:
             json.dump(cy_json, f, indent=4)
-        logger.info(f"Graph saved to {output_path}")
+        logger.info(f"Graph saved to {summarize_path(output_path)}")
     
     return cy_json
 
 def load_json_file(file_path: str) -> Dict[str, Any]:
     """Utility function to load JSON data from a file."""
-    if not os.path.exists(file_path):
+    of = fsspec.open(file_path, "r")
+    if not of.fs.exists(of.path):
         logger.error(f"File {file_path} not found.")
         return {}
-    with open(file_path, "r") as f:
+    with of as f:
         return json.load(f)
 
 def load_graph_viewmodel(file_path: str) -> Dict[str, Any]:
     """Loads the graph viewmodel JSON for the frontend."""
     return load_json_file(file_path)
 
+
+def summarize_path(path: str) -> str:
+    """Extracts a concise representation (project/run) of a path for logging."""
+    match = re.search(r'([^/]+)/(run-\d+-\d+)', path)
+    if match:
+        return f"{match.group(1)}/{match.group(2)}"
+    return path.split('/')[-1]
+
+def generate_output_path(input_path: str) -> str:
+    """Generates the output path for the graph JSON file."""
+    output_dir = os.getenv("OUTPUT_DIRECTORY", "outputs")
+    
+
+    match = re.search(r'(?P<base>s3://[^/]+/)?(?P<project>[^/]+)/(?P<run>run-\d+-\d+)', input_path)
+    
+    if match:
+        base = match.group('base')
+        project = match.group('project')
+        run_id = match.group('run')
+        
+        if base:
+            # Dynamically use the same S3 bucket but route to graph_tools prefix
+            return f"{base}graph_tools/{project}/{run_id}/graphNode.json"
+            
+        return f"{output_dir}/{project}/{run_id}/graphNode.json"
+    
+    summary = summarize_path(input_path)
+    raise ValueError(f"Input path '{summary}' does not contain a recognizable project/run structure.")
+
 if __name__ == "__main__":
     asyncio.run(generate_graph("graph.json", "outputs/graphNode.json"))