Skip to content

Commit 3806621

Browse files
authored
Merge pull request #85 from GenerateNU/07-ingest-service
Build document ingest service #7
2 parents 8918d17 + 198a388 commit 3806621

File tree

2 files changed

+117
-5
lines changed

2 files changed

+117
-5
lines changed

backend/app/services/ingest.py

Lines changed: 72 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,89 @@
77
import logging
88
from pathlib import Path
99

10+
import cognee
11+
from cognee import SearchType
12+
1013
logger = logging.getLogger(__name__)
1114

1215

13-
async def ingest_document(path: Path, dataset_name: str) -> dict:
14-
pass
16+
async def ingest_document(
    file_path: str,
    dataset_name: str,
    document_id: str | None = None,
) -> dict:
    """
    Ingest a document into the knowledge graph.

    Calls cognee.add() to ingest the file, then cognee.cognify() to
    process it into chunks, entities, relationships, and summaries.
    Finally extracts structured data from the processed results.

    Args:
        file_path: Path to the file to ingest.
        dataset_name: Cognee dataset the file is added to.
        document_id: Optional caller-supplied identifier; echoed back in
            the success payload, not passed to Cognee.

    Returns:
        On success: {"status": "success", "document_id": ...,
        "dataset_name": ..., **structured_data}.
        On failure: {"status": "error", "error": <exception message>}.
    """
    try:
        await cognee.add(file_path, dataset_name)
        await cognee.cognify([dataset_name])
        structured_data = await _extract_structured_data(dataset_name)

        return {
            "status": "success",
            "document_id": document_id,
            "dataset_name": dataset_name,
            **structured_data,
        }

    except Exception as e:
        # Log with traceback before collapsing into an error payload —
        # callers only see the dict, so this is the only failure record.
        logger.error("Ingest failed for %s", file_path, exc_info=True)
        return {
            "status": "error",
            "error": str(e),
        }
47+
48+
49+
async def _extract_structured_data(dataset_name: str) -> dict:
    """
    Query Cognee for structured data after cognify() has run.

    Uses SearchType.SUMMARIES for pre-computed summaries and
    SearchType.CHUNKS for raw text segments.

    Returns summary (str), entities (list), and raw_chunks_count (int).
    """
    summaries = await cognee.search(
        query_type=SearchType.SUMMARIES,
        query_text=dataset_name,
    )
    chunks = await cognee.search(
        query_type=SearchType.CHUNKS,
        query_text=dataset_name,
    )

    # First pre-computed summary, or "" when none were produced.
    top_summary = summaries[0] if summaries else ""

    # Not every chunk object carries an `entities` attribute — collect
    # only from those that do.
    collected_entities = [
        entity
        for chunk in chunks
        if hasattr(chunk, "entities")
        for entity in chunk.entities
    ]

    return {
        "summary": str(top_summary),
        "entities": collected_entities,
        "raw_chunks_count": len(chunks),
    }
80+
1581

1682
async def ingest_document_background(path: Path, dataset_name: str) -> None:
    """
    For FastAPI BackgroundTasks. Allows ingest_document to run in the
    background for large files.

    The temp file at `path` is always removed afterwards, whether or not
    the ingest succeeded.
    """
    try:
        await ingest_document(str(path), dataset_name)
    except Exception:
        logger.error("Background ingest failed for %s", path, exc_info=True)
    finally:
        try:
            path.unlink(missing_ok=True)
        except Exception:
            # missing_ok already covers the already-deleted case; anything
            # else (e.g. a permissions error) is worth a trace in the log
            # instead of being silently swallowed.
            logger.warning("Could not delete temp file %s", path, exc_info=True)

backend/tests/test_ingest.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
"""
2+
Not runnable yet — blocked on TICKET-01 and TICKET-02 being merged.
3+
Replace path/to/test.pdf with a real file before running.
4+
5+
Usage:
6+
pytest backend/tests/test_ingest.py -v
7+
"""
8+
9+
import pytest
10+
from app.services.ingest import ingest_document
11+
12+
13+
@pytest.mark.asyncio
async def test_ingest_document_success():
    """Test that ingesting a real PDF returns success with structured data."""
    result = await ingest_document(
        file_path="../mock_data/DeepFryer-1.pdf",  # Replace with a real PDF
        dataset_name="test-dataset",
    )

    assert result["status"] == "success"

    # Every structured-data key must be present in the payload.
    for key in ("summary", "entities", "raw_chunks_count"):
        assert key in result

    # Summary should not be empty.
    assert len(result["summary"]) > 0
    # Should extract at least one entity.
    assert len(result["entities"]) > 0
    # Should have at least one chunk.
    assert result["raw_chunks_count"] > 0
34+
35+
36+
@pytest.mark.asyncio
async def test_ingest_document_bad_file():
    """Test that a non-existent file returns an error status."""
    outcome = await ingest_document(
        file_path="nonexistent_file.pdf",
        dataset_name="test-dataset",
    )

    # The service converts every failure into an error payload rather
    # than raising, so the test checks the dict shape.
    assert outcome["status"] == "error"
    assert "error" in outcome

0 commit comments

Comments
 (0)