Skip to content

Commit aae36c4

Browse files
content search API optimize (open-edge-platform#2338)
1 parent 69d9947 commit aae36c4

54 files changed

Lines changed: 872 additions & 414 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

education-ai-suite/smart-classroom/content_search/README.md

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,56 @@
11
# Content Search
22
## Prerequisites
3+
1 python3
4+
python3.10
5+
6+
2 postgreSQL
7+
For PostgreSQL installation, see [PostgreSQL installation](./docs/dev_guide/Installation.md#postgresql)
8+
3 Minio
9+
For Minio installation, see [Minio installation](./docs/dev_guide/Installation.md#minio)
10+
11+
4 System Tools: Required for multimodal processing:
12+
13+
- Tesseract OCR: For image/PDF text extraction.
14+
- Poppler: For PDF rendering.
15+
316
## Environment Setup
17+
### Create/activate python venv
18+
```powershell
19+
# Create venv
20+
& '<your python dir>' -m venv venv
21+
.\venv\Scripts\Activate.ps1
22+
```
23+
24+
### Install dependencies
25+
```powershell
26+
cd xxx/content_search
27+
python -m pip install --upgrade pip
28+
pip install -r .\requirements.txt
29+
```
430
## Launch
31+
```powershell
32+
cd xxx/content_search
33+
python .\start_services.py
34+
```
535
// todo
36+
## Available Endpoints
37+
38+
| Endpoint | Method | Pattern | Description | Status |
39+
| :--- | :---: | :---: | :--- | :---: |
40+
| `/api/v1/system/health` | **GET** | SYNC | Backend app health check | DONE |
41+
| `/api/v1/task/query/{task_id}` | **GET** | SYNC | Query status of a specific task | DONE |
42+
| `/api/v1/task/list` | **GET** | SYNC | Query tasks by conditions (e.g., `?status=PROCESSING`) | DONE |
43+
| `/api/v1/task/cancel/{task_id}` | **POST** | SYNC | Cancel a running task | WIP |
44+
| `/api/v1/task/pause/{task_id}` | **POST** | SYNC | Pause a running task | WIP |
45+
| `/api/v1/task/resume/{task_id}` | **POST** | SYNC | Resume a paused task | WIP |
46+
| `/api/v1/object/files` | **GET** | SYNC | Query files in MinIO with filters | DONE |
47+
| `/api/v1/object/upload` | **POST** | ASYNC | Upload a file to MinIO | DONE |
48+
| `/api/v1/object/ingest` | **POST** | ASYNC | Ingest a specific file from MinIO | WIP |
49+
| `/api/v1/object/ingest-text` | **POST** | ASYNC | Embed a raw text | WIP |
50+
| `/api/v1/object/upload-ingest` | **POST** | ASYNC | Upload to MinIO and trigger ingestion | DONE |
51+
| `/api/v1/object/search` | **POST** | ASYNC | Search for files based on description | DONE |
52+
| `/api/v1/object/download` | **POST** | STREAM | Download file from MinIO | DONE |
53+
| `/api/v1/video/summarization` | **POST** | STREAM | Generate video summarization | WIP |
654

755
## API reference
856
[Content Search API reference](./docs/dev_guide/Content_search_API.md)
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
1+
#
12
# Copyright (C) 2026 Intel Corporation
23
# SPDX-License-Identifier: Apache-2.0
4+
#
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#
2+
# Copyright (C) 2026 Intel Corporation
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#
2+
# Copyright (C) 2026 Intel Corporation
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
1+
#
2+
# Copyright (C) 2026 Intel Corporation
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
16
# api/v1/api.py
27
from fastapi import APIRouter
3-
from api.v1.endpoints import system, object, task
8+
from api.v1.endpoints import system, object, task, vecdatabase
49

510
api_router = APIRouter()
611

712
api_router.include_router(system.router, prefix="/system", tags=["System"])
813
api_router.include_router(object.router, prefix="/object", tags=["EDU-AI Process"])
914
api_router.include_router(task.router, prefix="/task", tags=["EDU-AI Task"])
10-
15+
api_router.include_router(vecdatabase.router, prefix="/vecdb", tags=["Chroma Database"])
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#
2+
# Copyright (C) 2026 Intel Corporation
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+

education-ai-suite/smart-classroom/content_search/api/v1/endpoints/object.py

Lines changed: 59 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
1+
#
2+
# Copyright (C) 2026 Intel Corporation
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
16
from fastapi import APIRouter, Depends, HTTPException, File, UploadFile, BackgroundTasks
27
from fastapi.responses import StreamingResponse
38
from sqlalchemy.orm import Session
4-
from database import get_db
5-
from services.task_service import task_service
6-
from services.storage_service import storage_service
7-
from services.search_service import search_service
9+
from utils.database import get_db
10+
from utils.task_service import task_service
11+
from utils.storage_service import storage_service
12+
from utils.search_service import search_service
813
import urllib.parse
914
import mimetypes
10-
from core.responses import resp_200
15+
from utils.core_responses import resp_200
1116

1217
router = APIRouter()
1318

@@ -26,7 +31,55 @@ async def upload_video(background_tasks: BackgroundTasks, file: UploadFile = Fil
2631
message="File received, processing started."
2732
)
2833

29-
# @router.post("/ingest")
34+
@router.post("/ingest")
async def ingest_existing_file(
    payload: dict,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db)
):
    """Trigger ingestion of a file that already exists in MinIO.

    The JSON payload must contain ``file_key``; ``bucket_name`` is optional
    and defaults to ``content-search``. Delegates the actual work to
    ``task_service.handle_file_ingest`` and returns the created task's id.
    """
    # Reject requests that do not identify a file to ingest.
    file_key = payload.get("file_key")
    if not file_key:
        raise HTTPException(status_code=400, detail="file_key is required")

    bucket_name = payload.get("bucket_name", "content-search")

    result = await task_service.handle_file_ingest(
        db,
        {"file_key": file_key, "bucket_name": bucket_name},
        background_tasks,
    )

    return resp_200(
        data={
            "task_id": str(result["task_id"]),
            "status": result["status"],
            "file_key": file_key,
        },
        message="Ingestion process started for existing file"
    )
59+
60+
@router.post("/ingest-text")
async def ingest_raw_text(
    payload: dict,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db)
):
    """Start an asynchronous task that embeds raw text from the payload.

    The payload must contain a non-empty ``text`` field; the whole payload
    is forwarded to ``task_service.handle_text_ingest``.
    """
    if not payload.get("text"):
        raise HTTPException(status_code=400, detail="Text content is required")

    result = await task_service.handle_text_ingest(db, payload, background_tasks)

    return resp_200(
        data={
            "task_id": str(result["task_id"]),
            "status": result["status"],
        },
        message="Text ingestion started"
    )
3083

3184
@router.post("/upload-ingest")
3285
async def upload_file_with_ingest(

education-ai-suite/smart-classroom/content_search/api/v1/endpoints/system.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
1+
#
2+
# Copyright (C) 2026 Intel Corporation
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
16
from fastapi import APIRouter, Depends
27
from sqlalchemy.orm import Session
38
from sqlalchemy import text
4-
from database import get_db
5-
from services.storage_service import storage_service
9+
from utils.database import get_db
10+
from utils.storage_service import storage_service
611
import time
712

813
router = APIRouter()
@@ -39,4 +44,4 @@ async def health_check(db: Session = Depends(get_db)):
3944
status["services"]["minio"] = f"offline: {err}"
4045
status["status"] = "unhealthy"
4146

42-
return status
47+
return status

education-ai-suite/smart-classroom/content_search/api/v1/endpoints/task.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
1+
#
2+
# Copyright (C) 2026 Intel Corporation
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
16
from fastapi import APIRouter, Depends, HTTPException, Query
27
from sqlalchemy.orm import Session
38
from typing import Optional, List
4-
from database import get_db
5-
from crud.task_crud import task_crud
9+
from utils.database import get_db
10+
from utils.crud_task import task_crud
611
from uuid import UUID
7-
from core.models import AITask
8-
from core.responses import resp_200
12+
from utils.core_models import AITask
13+
from utils.core_responses import resp_200
914

1015
router = APIRouter()
1116
@router.get("/list")
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
import logging
2+
from fastapi import APIRouter, HTTPException
3+
from typing import Optional, List, Dict, Any
4+
5+
from providers.chromadb_wrapper.chroma_client import ChromaClientWrapper
6+
7+
logger = logging.getLogger(__name__)
8+
9+
# Initialize the ChromaDB Wrapper
10+
chroma_db = ChromaClientWrapper()
11+
12+
router = APIRouter()
13+
14+
# --- Data Query Endpoints ---
15+
16+
@router.get("/list-ids")
async def list_ids(collection_name: str):
    """Return every document ID stored in *collection_name*.

    An empty collection yields ``{"ids": [], "count": 0}`` instead of a 404
    so that client-side pagination loops terminate cleanly.
    """
    records = chroma_db.query_all(collection_name=collection_name)
    if not records:
        return {"ids": [], "count": 0}

    id_list = [record['id'] for record in records]
    return {"ids": id_list, "count": len(id_list)}
28+
29+
@router.post("/get-by-ids")
async def get_by_ids(collection_name: str, ids: List[str], include_vector: bool = False):
    """Fetch specific records by ID, returning metadata and, when
    *include_vector* is true, the stored embeddings as well.
    """
    # Metadata is always returned; embeddings only on explicit request.
    fields = ['meta'] + (['vector'] if include_vector else [])

    try:
        records = chroma_db.get(ids=ids, output_fields=fields, collection_name=collection_name)
    except Exception as e:
        logger.error(f"Error fetching IDs {ids}: {e}")
        raise HTTPException(status_code=500, detail="Internal server error during data retrieval")

    return {"results": records}
44+
45+
@router.post("/search")
async def search_vectors(
    collection_name: str,
    query_embeddings: List[List[float]],
    n_results: int = 5,
    where: Optional[Dict[str, Any]] = None
):
    """Run a vector similarity query against *collection_name*.

    Backs the 'search by image' / 'search by text' features: callers supply
    pre-computed embeddings plus an optional metadata filter (*where*).
    """
    try:
        hits = chroma_db.query(
            collection_name=collection_name,
            query_embeddings=query_embeddings,
            where=where,
            n_results=n_results,
        )
    except Exception as e:
        logger.error(f"Vector search failed in {collection_name}: {e}")
        raise HTTPException(status_code=500, detail="Vector search execution failed")

    return {"results": hits}
67+
68+
# --- Data Manipulation Endpoints ---
69+
70+
@router.post("/insert")
async def insert_data(collection_name: str, data: List[Dict[str, Any]]):
    """Insert vector data and metadata into *collection_name*.

    Each item is expected as ``{"id": "uuid", "vector": [...], "meta": {...}}``.
    """
    try:
        info = chroma_db.insert(data=data, collection_name=collection_name)
    except Exception as e:
        logger.error(f"Insertion failed: {e}")
        raise HTTPException(status_code=400, detail="Invalid data format or database connection error")

    return {"status": "success", "info": info}
82+
83+
@router.delete("/delete")
async def delete_data(collection_name: str, ids: List[str]):
    """Remove the records with the given IDs from *collection_name*."""
    try:
        info = chroma_db.delete(ids=ids, collection_name=collection_name)
    except Exception as e:
        logger.error(f"Deletion failed: {e}")
        raise HTTPException(status_code=400, detail="Failed to delete specified IDs")

    return {"status": "success", "info": info}
94+
95+
# --- Collection Management Endpoints ---
96+
97+
@router.get("/collections")
async def list_collections():
    """List the names of all collections in the ChromaDB instance."""
    try:
        names = [collection.name for collection in chroma_db.client.list_collections()]
    except Exception as e:
        logger.error(f"Could not list collections: {e}")
        raise HTTPException(status_code=500, detail="Database connection error")

    return {"collections": names}
108+
109+
@router.get("/count")
async def get_collection_count(collection_name: str):
    """Return the total number of items stored in *collection_name*."""
    collection = chroma_db.load_collection(collection_name)
    # NOTE(review): assumes load_collection returns a falsy value for a
    # missing collection — confirm against the wrapper implementation.
    if not collection:
        raise HTTPException(status_code=404, detail=f"Collection '{collection_name}' not found")
    return {"collection": collection_name, "count": collection.count()}
118+
119+
@router.delete("/drop-collection")
async def drop_collection(collection_name: str):
    """
    Completely delete a collection and all its data. Use with caution.

    Raises HTTP 400 carrying the underlying error text when deletion fails
    (e.g. the collection does not exist).
    """
    try:
        chroma_db.client.delete_collection(name=collection_name)
        return {"status": "success", "message": f"Collection '{collection_name}' deleted"}
    except Exception as e:
        # Log before surfacing — every other handler in this module logs its
        # failures; this one previously dropped the error from the server log.
        logger.error(f"Failed to drop collection {collection_name}: {e}")
        raise HTTPException(status_code=400, detail=str(e))

0 commit comments

Comments
 (0)