Skip to content

Commit 3806621

Browse files
authored
Merge pull request #85 from GenerateNU/07-ingest-service
Build document ingest service #7
2 parents 8918d17 + 198a388 commit 3806621

File tree

2 files changed

+117
-5
lines changed

2 files changed

+117
-5
lines changed

backend/app/services/ingest.py

Lines changed: 72 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,89 @@
77
import logging
88
from pathlib import Path
99

10+
import cognee
11+
from cognee import SearchType
12+
1013
logger = logging.getLogger(__name__)
1114

1215

13-
async def ingest_document(path: Path, dataset_name: str) -> dict:
14-
pass
16+
async def ingest_document(
    file_path: str,
    dataset_name: str,
    document_id: str | None = None,
) -> dict:
    """
    Ingest a document into the knowledge graph.

    Calls cognee.add() to ingest the file, then cognee.cognify() to
    process it into chunks, entities, relationships, and summaries.
    Finally extracts structured data from the processed results.

    Args:
        file_path: Path to the file to ingest.
        dataset_name: Cognee dataset the file is added to.
        document_id: Optional caller-supplied identifier; echoed back in
            the success payload, not passed to Cognee.

    Returns:
        On success: {"status": "success", "document_id": ...,
        "dataset_name": ..., **structured_data}.
        On failure: {"status": "error", "error": <exception message>}.
    """
    try:
        await cognee.add(file_path, dataset_name)
        await cognee.cognify([dataset_name])
        structured_data = await _extract_structured_data(dataset_name)

        return {
            "status": "success",
            "document_id": document_id,
            "dataset_name": dataset_name,
            **structured_data,
        }

    except Exception as e:
        # Log with traceback before collapsing into an error payload —
        # callers only see the dict, so this is the only failure record.
        logger.error("Ingest failed for %s", file_path, exc_info=True)
        return {
            "status": "error",
            "error": str(e),
        }
47+
48+
49+
async def _extract_structured_data(dataset_name: str) -> dict:
    """
    Query Cognee for structured data after cognify() has run.

    Uses SearchType.SUMMARIES for pre-computed summaries and
    SearchType.CHUNKS for raw text segments.

    Returns summary (str), entities (list), and raw_chunks_count (int).
    """
    summaries = await cognee.search(
        query_type=SearchType.SUMMARIES,
        query_text=dataset_name,
    )
    chunks = await cognee.search(
        query_type=SearchType.CHUNKS,
        query_text=dataset_name,
    )

    # First pre-computed summary, or "" when none were produced.
    top_summary = summaries[0] if summaries else ""

    # Not every chunk object carries an `entities` attribute — collect
    # only from those that do.
    collected_entities = [
        entity
        for chunk in chunks
        if hasattr(chunk, "entities")
        for entity in chunk.entities
    ]

    return {
        "summary": str(top_summary),
        "entities": collected_entities,
        "raw_chunks_count": len(chunks),
    }
80+
1581

1682
async def ingest_document_background(path: Path, dataset_name: str) -> None:
    """
    For FastAPI BackgroundTasks. Allows ingest_document to run in the
    background for large files.

    The temp file at `path` is always removed afterwards, whether or not
    the ingest succeeded.
    """
    try:
        await ingest_document(str(path), dataset_name)
    except Exception:
        logger.error("Background ingest failed for %s", path, exc_info=True)
    finally:
        try:
            path.unlink(missing_ok=True)
        except Exception:
            # missing_ok already covers the already-deleted case; anything
            # else (e.g. a permissions error) is worth a trace in the log
            # instead of being silently swallowed.
            logger.warning("Could not delete temp file %s", path, exc_info=True)

backend/tests/test_ingest.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
"""
2+
Not runnable yet — blocked on TICKET-01 and TICKET-02 being merged.
3+
Replace path/to/test.pdf with a real file before running.
4+
5+
Usage:
6+
pytest backend/tests/test_ingest.py -v
7+
"""
8+
9+
import pytest
10+
from app.services.ingest import ingest_document
11+
12+
13+
@pytest.mark.asyncio
async def test_ingest_document_success():
    """Test that ingesting a real PDF returns success with structured data."""
    result = await ingest_document(
        file_path="../mock_data/DeepFryer-1.pdf",  # Replace with a real PDF
        dataset_name="test-dataset",
    )

    assert result["status"] == "success"

    # Every structured-data key must be present in the payload.
    for key in ("summary", "entities", "raw_chunks_count"):
        assert key in result

    # Summary should not be empty.
    assert len(result["summary"]) > 0
    # Should extract at least one entity.
    assert len(result["entities"]) > 0
    # Should have at least one chunk.
    assert result["raw_chunks_count"] > 0
34+
35+
36+
@pytest.mark.asyncio
async def test_ingest_document_bad_file():
    """Test that a non-existent file returns an error status."""
    outcome = await ingest_document(
        file_path="nonexistent_file.pdf",
        dataset_name="test-dataset",
    )

    # The service converts every failure into an error payload rather
    # than raising, so the test checks the dict shape.
    assert outcome["status"] == "error"
    assert "error" in outcome

0 commit comments

Comments
 (0)