Merge pull request #2 from alphagov/feature/dockerize-content-extractor

daymohlar · web-flow · commit 8d6bfc9526ba · 2026-04-20T16:55:11.000+01:00
ACW-19 Content extractor
diff --git a/Dockerfile b/Dockerfile
@@ -17,6 +17,7 @@ RUN uv sync --no-dev --no-install-project
 
 COPY src/ ./src/
 COPY app.py ./app.py
+# COPY graph.json ./
 
 
 
diff --git a/src/content_extractor/s3_sequential.py b/src/content_extractor/s3_sequential.py
@@ -1,6 +1,7 @@
 import asyncio
 import logging
-from typing import List
+import json
+from typing import List, Dict, Any
 from collections import defaultdict
 from .base import BaseQuoteExtractor, Finding, FinalQuoteExtraction, BaseExtractorConfig
 from src.url.generator import generate_url_fragement, s3_to_govuk_url
@@ -11,6 +12,47 @@
 class S3QuoteExtractor(BaseQuoteExtractor):
     """Processes documents sequentially by fetching from S3 and chunking."""
     
+    def __init__(self, config: BaseExtractorConfig):
+        super().__init__(config)
+        self.url_map: Dict[str, str] = {}
+
+    def _fetch_url_map(self, s3_uris: List[str]):
+        """
+        Attempts to fetch sources.json from "<prefix>/input/" when a URI contains "/input/", else from the URI's parent path.
+        Deduplicates the candidate sources.json locations and merges their mappings into self.url_map.
+        """
+        if not s3_uris:
+            return
+
+        sources_locations = set()
+        for uri in s3_uris:
+            if uri in self.url_map:
+                continue
+                
+            if "/input/" in uri:
+                sources_uri = uri.split("/input/")[0] + "/input/sources.json"
+            else:
+                sources_uri = "/".join(uri.split("/")[:-1]) + "/sources.json"
+            sources_locations.add(sources_uri)
+
+        for sources_uri in sources_locations:
+            logger.info(f"Attempting to fetch sources map from {sources_uri}...")
+            content = self.fetch_s3_content(sources_uri)
+            if content:
+                try:
+                    new_map = json.loads(content)
+                    self.url_map.update(new_map)
+                    logger.info(f"Successfully loaded {len(new_map)} mappings from {sources_uri}.")
+                except Exception as e:
+                    logger.error(f"Failed to parse {sources_uri}: {e}")
+            else:
+                logger.warning(f"No sources.json found at {sources_uri}.")
+        
+        if self.url_map:
+            logger.info(f"Total URL mappings loaded: {len(self.url_map)}")
+        else:
+            logger.warning("No URL mappings loaded. Falling back to derived URLs.")
+
     async def process_document(self, s3_uri: str, keywords: List[str], results_list: list):
         """Processes a single document for a specific set of keywords."""
         content = self.fetch_s3_content(s3_uri)
@@ -20,6 +62,8 @@ async def process_document(self, s3_uri: str, keywords: List[str], results_list:
         if len(chunks) > 1:
             logger.info(f"  Split {s3_uri} into {len(chunks)} chunks.")
 
+        base_govuk_url = s3_to_govuk_url(s3_uri, self.url_map)
+
         for i, chunk in enumerate(chunks, 1):
             prompt = (
                 f"Keywords: {', '.join(keywords)}\n\n"
@@ -32,24 +76,26 @@ async def process_document(self, s3_uri: str, keywords: List[str], results_list:
                         "content": q.content,
                         "keyword_matched": q.keyword_matched,
                         "source": s3_uri,
-                        "link": generate_url_fragement(s3_to_govuk_url(s3_uri), q.content)
+                        "link": generate_url_fragement(base_govuk_url, q.content)
                     })
             except Exception as e:
                 logger.error(f"  Error in {s3_uri} chunk {i}: {e}")
 
-    async def run_mapping(self, doc_to_keywords: dict):
+    async def run_mapping(self, doc_to_keywords: Dict[str, List[str]]):
         """Processes documents based on a mapping of {s3_uri: [keywords]}."""
         raw_findings = []
         
+        self._fetch_url_map(list(doc_to_keywords.keys()))
+        
         tasks = [
             self.process_document(uri, keywords, raw_findings) 
             for uri, keywords in doc_to_keywords.items()
         ]
         await asyncio.gather(*tasks)
         return raw_findings
 
-    async def run(self, output_file: str = "outputs/extracted_quotes.json"):
-        """Main entry point to run extraction and save results."""
+    async def run(self):
+        """Main entry point to run extraction"""
         doc_to_keywords = {uri: self.config.keywords for uri in self.config.s3_documents}
         raw_findings = await self.run_mapping(doc_to_keywords)
 
diff --git a/src/generate_graph.py b/src/generate_graph.py
@@ -8,6 +8,9 @@
 from src.content_extractor.s3_sequential import S3QuoteExtractor
 from src.content_extractor.base import BaseExtractorConfig
 from src.content_extractor.highlighter import highlight_occurrence
+from src.models.graph_models import (
+    GraphInput, GraphOutput, Node, NodeData, Edge, EdgeData, Occurrence, Entity
+)
 
 logger = logging.getLogger(__name__)
 
@@ -17,25 +20,19 @@ def slugify(text: str) -> str:
     text = re.sub(r'[^a-z0-9]+', '_', text)
     return text.strip('_')
 
-def build_registries(entities: List[Dict[str, Any]]) -> Dict[str, Any]:
-    """Parses entities to map s3_uris to keywords and metadata."""
+def build_registries(entities: List[Entity]) -> Dict[str, Any]:
+    """Parses entities to map s3_uris to keywords and metadata based on structured aliases."""
     registry = defaultdict(lambda: {"keywords": set(), "entities": []})
     
     for ent in entities:
-        props = ent.get("properties", {})
-        source_urls_raw = props.get("sourceUrls", [])
-        
-        if isinstance(source_urls_raw, str):
-            s3_uris = [u.strip() for u in source_urls_raw.split(',')]
-        else:
-            s3_uris = source_urls_raw
-            
-        aliases = ent.get("aliases", [])
-        
-        for uri in s3_uris:
-            if not uri: continue
-            registry[uri]["keywords"].update(aliases)
-            registry[uri]["entities"].append(ent)
+        for alias in ent.aliases:
+            for uri in alias.source_files:
+                if not uri or not uri.startswith("s3://"):
+                    continue
+                registry[uri]["keywords"].add(alias.name)
+                # Ensure each entity is only added once per unique URI
+                if ent not in registry[uri]["entities"]:
+                    registry[uri]["entities"].append(ent)
             
     return registry
 
@@ -65,63 +62,64 @@ def map_findings_to_entities(raw_findings: List[Dict[str, Any]], registry: Dict[
         uri = finding["source"]
         keyword = finding["keyword_matched"]
         content = finding["content"]
-        link = finding["link"] # Use the pre-calculated link from extractor
+        link = finding["link"] 
         
         for ent in registry[uri]["entities"]:
-            if keyword in ent.get("aliases", []):
-                occurrence = {
-                    "link": link,
-                    "context": highlight_occurrence(content, keyword)
-                }
-                results[ent["canonical_key"]][keyword].append(occurrence)
+            if any(a.name == keyword for a in ent.aliases):
+                occurrence = Occurrence(
+                    link=link,
+                    context=highlight_occurrence(content, keyword)
+                )
+                results[ent.canonical_key][keyword].append(occurrence)
                 
     return results
 
-def build_node_structure(entities: List[Dict[str, Any]], entity_results: Dict[str, Any]) -> Dict[str, Any]:
+def build_node_structure(entities: List[Entity], entity_results: Dict[str, Any]) -> GraphOutput:
     """Constructs the final list of nodes and edges."""
     nodes, edges = [], []
 
     for ent in entities:
-        ent_id = ent["canonical_key"]
-        human_label = ent.get("label") or ent_id.replace("_", " ").title()
-        nodes.append({"data": {"id": ent_id, "label": human_label, "type": "entity"}})
+        ent_id = ent.canonical_key
+        human_label = ent.label or ent_id.replace("_", " ").title()
+        nodes.append(Node(data=NodeData(id=ent_id, label=human_label, type="entity")))
         
         # Use a dict to accumulate alias nodes by their slugified ID to avoid duplicates
         alias_map = {}
         
-        for alias in ent.get("aliases", []):
+        for alias_obj in ent.aliases:
+            alias = alias_obj.name
             occurrences = entity_results[ent_id].get(alias, [])
             alias_id = f"{ent_id}__{slugify(alias)}"
             
             if alias_id not in alias_map:
-                alias_map[alias_id] = {
-                    "id": alias_id,
-                    "label": alias,
-                    "type": "alias",
-                    "occurrences": []
-                }
+                alias_map[alias_id] = NodeData(
+                    id=alias_id,
+                    label=alias,
+                    type="alias",
+                    occurrences=[]
+                )
             
             if occurrences:
-                alias_map[alias_id]["occurrences"].extend(occurrences)
+                alias_map[alias_id].occurrences.extend(occurrences)
         
         # Add the deduplicated alias nodes and their edges
-        for alias_id, alias_data in alias_map.items():
-            # If no occurrences, remove the empty list from the data
-            if not alias_data["occurrences"]:
-                del alias_data["occurrences"]
+        for alias_id, node_data in alias_map.items():
+            # If no occurrences, set the field to None so it is omitted when dumped with exclude_none=True
+            if not node_data.occurrences:
+                node_data.occurrences = None
             
-            nodes.append({"data": alias_data})
+            nodes.append(Node(data=node_data))
             
-            count = len(alias_data.get("occurrences", []))
-            edges.append({
-                "data": {
-                    "source": ent_id,
-                    "target": alias_id,
-                    "label": f"Alias ({count})" if count > 0 else "Alias"
-                }
-            })
+            count = len(node_data.occurrences) if node_data.occurrences else 0
+            edges.append(Edge(
+                data=EdgeData(
+                    source=ent_id,
+                    target=alias_id,
+                    label=f"Alias ({count})" if count > 0 else "Alias"
+                )
+            ))
 
-    return {"nodes": nodes, "edges": edges}
+    return GraphOutput(nodes=nodes, edges=edges)
 
 async def generate_graph(input_data: Union[str, Dict[str, Any]], output_path: Optional[str] = None):
     """Main orchestration function. Can take a file path (str) or a dictionary."""
@@ -134,13 +132,21 @@ async def generate_graph(input_data: Union[str, Dict[str, Any]], output_path: Op
     else:
         graph_data = input_data
     
-    entities = graph_data.get("entities", [])
+    # Validate input
+    try:
+        validated_input = GraphInput.model_validate(graph_data)
+        entities = validated_input.entities
+    except Exception as e:
+        logger.error(f"Input validation failed: {e}")
+        raise
+
     registry = build_registries(entities)
     
     raw_findings = await fetch_extraction_findings(registry)
     entity_results = map_findings_to_entities(raw_findings, registry)
     
-    cy_json = build_node_structure(entities, entity_results)
+    cy_graph = build_node_structure(entities, entity_results)
+    cy_json = cy_graph.model_dump(exclude_none=True)
     
     if output_path:
         os.makedirs(os.path.dirname(output_path), exist_ok=True)
diff --git a/src/models/__init__.py b/src/models/__init__.py
diff --git a/src/models/graph_models.py b/src/models/graph_models.py
@@ -0,0 +1,47 @@
+from pydantic import BaseModel, Field, ConfigDict
+from typing import List, Optional, Dict, Union, Any, Literal
+
+class Alias(BaseModel):
+    name: str
+    source_files: List[str] = Field(default_factory=list)
+
+class Entity(BaseModel):
+    id: str
+    canonical_key: str
+    label: Optional[str] = None
+    aliases: List[Alias] = Field(default_factory=list)
+    properties: Dict[str, Any] = Field(default_factory=dict)
+    type: Optional[str] = None
+    description: Optional[str] = None
+
+    model_config = ConfigDict(extra="allow")
+
+class GraphInput(BaseModel):
+    entities: List[Entity]
+    
+    model_config = ConfigDict(extra="allow")
+
+class Occurrence(BaseModel):
+    link: str
+    context: str
+
+class NodeData(BaseModel):
+    id: str
+    label: str
+    type: Literal["entity", "alias"]
+    occurrences: Optional[List[Occurrence]] = None
+
+class Node(BaseModel):
+    data: NodeData
+
+class EdgeData(BaseModel):
+    source: str
+    target: str
+    label: str
+
+class Edge(BaseModel):
+    data: EdgeData
+
+class GraphOutput(BaseModel):
+    nodes: List[Node]
+    edges: List[Edge]
diff --git a/src/url/generator.py b/src/url/generator.py
@@ -1,20 +1,25 @@
 import urllib.parse
+from typing import Optional
 
-def convert_string_to_url_query_format(text: str):
-    # For GOV.UK text fragments (#:~:text=), characters like '-' are reserved
-    # syntax characters and must be percent-encoded. 
-    # Python's urllib.parse.quote never quotes '-', '.', '_', or '~'.
-    # So we manually encode '-' to ensure it works with text fragments.
+def convert_string_to_url_query_format(text: str)-> str:
     quoted = urllib.parse.quote(text, safe='')
-    return quoted.replace('-', '%2D')
+    quoted= (quoted
+            .replace('-', '%2D')
+            .replace('.', '%2E')
+            .replace('~', '%7E')
+            .replace('_', '%5F'))
+    return quoted
 
 def generate_url_fragement(base_url: str, content: str):
     encoded_content = convert_string_to_url_query_format(content)
     url = f"{base_url}#:~:text={encoded_content}"
     return url
 
-def s3_to_govuk_url(s3_uri: str) -> str:
-    """Derives a GOV.UK URL directly from an S3 URI by stripping the prefix and extension."""
+def s3_to_govuk_url(s3_uri: str, url_map: Optional[dict] = None) -> str:
+    """Derives a GOV.UK URL from an S3 URI: returns the url_map entry for s3_uri when one exists, otherwise uses the fallback derivation below."""
+    if url_map and s3_uri in url_map:
+        return url_map[s3_uri]
+
     if "/input/" in s3_uri:
         path = s3_uri.split("/input/")[-1]
     else:
diff --git a/tests/test_graph_validation.py b/tests/test_graph_validation.py

Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,7 @@ RUN uv sync --no-dev --no-install-project`
`17`	`17`
`18`	`18`	`COPY src/ ./src/`
`19`	`19`	`COPY app.py ./app.py`
	`20`	`+# COPY graph.json ./`
`20`	`21`
`21`	`22`
`22`	`23`