Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 12 additions & 15 deletions education-ai-suite/smart-classroom/content_search/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,18 @@ python .\start_services.py

## API Endpoints

| Endpoint | Method | Pattern | Description | Status |
| :--- | :---: | :---: | :--- | :---: |
| `/api/v1/system/health` | **GET** | SYNC | Backend app health check | DONE |
| `/api/v1/task/query/{task_id}` | **GET** | SYNC | Query status of a specific task | DONE |
| `/api/v1/task/list` | **GET** | SYNC | Query tasks by conditions (e.g., `?status=PROCESSING`) | DONE |
| `/api/v1/object/upload` | **POST** | ASYNC | Upload a file to MinIO | DONE |
| `/api/v1/object/ingest` | **POST** | ASYNC | Ingest a specific file from MinIO | DONE |
| `/api/v1/object/ingest-text` | **POST** | ASYNC | Embed raw text | DONE |
| `/api/v1/object/upload-ingest` | **POST** | ASYNC | Upload to MinIO and trigger ingestion | DONE |
| `/api/v1/object/search` | **POST** | ASYNC | Search for files based on description | DONE |
| `/api/v1/object/download` | **POST** | STREAM | Download file from MinIO | DONE |

## API reference
[Content Search API reference](./docs/dev_guide/Content_search_API.md)

| Endpoint | Method | Pattern | Description |
| :--- | :---: | :---: | :--- |
| `/api/v1/task/query/{task_id}` | **GET** | SYNC | **Task Status Inspection**: Retrieves real-time metadata for a specific job, including current lifecycle state (PENDING, PROCESSING, COMPLETED, FAILED), and error logs if applicable. |
| `/api/v1/task/list` | **GET** | SYNC | **Batch Task Retrieval**: Queries task records. Supports filtering via query parameters (e.g., `?status=PROCESSING`) for monitoring system load and pipeline efficiency. |
| `/api/v1/object/ingest-text` | **POST** | ASYNC | **Text-Specific Ingestion**: Primarily processes raw text strings passed in the request body for semantic indexing. It also supports fetching content from existing text-based objects in MinIO. |
| `/api/v1/object/upload-ingest` | **POST** | ASYNC | **Atomic Upload & Ingestion**: A unified workflow that first saves the file to MinIO and then immediately initiates the ingestion pipeline. Features full content indexing and AI-driven Video Summarization for supported video formats. |
| `/api/v1/object/search` | **POST** | SYNC | **Semantic Content Retrieval**: Executes a similarity search across vector collections using either natural language queries or base64-encoded images. Returns ranked results with associated metadata and MinIO object references. |
| `/api/v1/object/download` | **POST** | STREAM | **Original File Download**: Securely fetches the raw source file directly from MinIO storage. Utilizes stream-bridging to pipe binary data to the client. |

For detailed descriptions and examples of each endpoint, please refer to the [Content Search API reference](./docs/dev_guide/Content_search_API.md).

## Components API reference
[Ingest and Retrieve](./docs/dev_guide/file_ingest_and_retrieve/API_GUIDE.md)

[Video Preprocess](./docs/dev_guide/video_preprocess/API_GUIDE.md)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,24 +76,21 @@ async def ingest_existing_file(
)

class IngestTextRequest(BaseModel):
text: str
bucket_name: Optional[str] = None
file_path: Optional[str] = None
text: Optional[str] = None
bucket_name: Optional[str] = "content-search"
file_key: Optional[str] = None
meta: Dict[str, Any] = Field(default_factory=dict)

@router.post("/ingest-text")
async def ingest_raw_text(
request: IngestTextRequest,
background_tasks: BackgroundTasks,
db: Session = Depends(get_db)
):
payload = request.model_dump()

if "tags" not in payload["meta"] or payload["meta"]["tags"] is None:
payload["meta"]["tags"] = ["default"]

result = await task_service.handle_text_ingest(
db,
payload,
request.model_dump(),
background_tasks
)

Expand All @@ -102,7 +99,7 @@ async def ingest_raw_text(
"task_id": str(result["task_id"]),
"status": result["status"]
},
message="Text ingestion started"
message="Text ingestion task created successfully"
)

@router.post("/upload-ingest")
Expand Down Expand Up @@ -145,44 +142,11 @@ async def upload_file_with_ingest(
message="Upload and Ingest started"
)

# @router.post("/search")
# async def file_search(payload: dict):
# query = payload.get("query")
# limit = payload.get("max_num_results", 3)
# if not query:
# raise HTTPException(status_code=400, detail="Query cannot be empty")

# search_data = await search_service.semantic_search(query, limit)
# return resp_200(data=search_data, message="Resource found")

@router.post("/search")
async def file_search(payload: dict):
query = payload.get("query")
image_base64 = payload.get("image_base64")
filters = payload.get("filter")
limit = payload.get("max_num_results", 10)

if not query and not image_base64:
raise HTTPException(status_code=400, detail="Either 'query' or 'image_base64' must be provided")

if query and image_base64:
raise HTTPException(status_code=400, detail="Provide only one of 'query' or 'image_base64'")

search_payload = {
"max_num_results": limit
}

if query:
search_payload["query"] = query
else:
search_payload["image_base64"] = image_base64

if filters:
search_payload["filter"] = filters

search_data = await search_service.semantic_search(search_payload)
async def file_search(payload: dict, db: Session = Depends(get_db)):
result = await task_service.handle_sync_search(db, payload)

return resp_200(data=search_data, message="Search completed")
return resp_200(data=result, message="Search completed")

@router.get("/download")
async def download_file(file_key: str):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ stateDiagram-v2
```

## API Endpoints

### Get Task List
### Task endpoints
#### Get Task List

* URL: /api/v1/task/list

Expand Down Expand Up @@ -142,7 +142,7 @@ Response (200 OK)
"timestamp": 1774330753
}
```
### Task Status Polling
#### Task Status Polling
Used to track the progress and retrieve the final result of a submitted task.

* URL: /api/v1/task/query/{task_id}
Expand All @@ -153,39 +153,50 @@ Used to track the progress and retrieve the final result of a submitted task.

Request:
```
curl --location 'http://127.0.0.1:9011/api/v1/task/query/56cc417c-9524-41a9-a500-9f0c44a05eac'
curl --location 'http://127.0.0.1:9011/api/v1/task/query/6b9a6a55-d327-42fe-b05e-e0f3098fe797'
```

Response (200 OK):
```json
{
"code": 20000,
"data": {
"task_id": "e557b305-e37c-4074-a04a-ebd067efbd5d",
"task_id": "6b9a6a55-d327-42fe-b05e-e0f3098fe797",
"status": "COMPLETED",
"progress": 100,
"result": {
"message": "File from MinIO successfully processed. db returns {'visual': {'insert_count': 1}}",
"video_summary": {
"type": "done",
"job_id": "bc6513aa-e118-4945-84a8-02922595044e",
"run_id": "5e405f58-03cf-4e44-9e10-85741283587a",
"asset_id": "classroom_8.mp4",
"total_chunks": 1,
"succeeded_chunks": 1,
"failed_chunks": 0,
"ingest_ok_chunks": 1,
"ingest_failed_chunks": 0,
"elapsed_seconds": 36.89442276954651
"message": "Upload only, no ingest requested",
"file_info": {
"source": "minio",
"file_key": "runs/9e96f16a-9689-4c25-a515-04a1040b193f/raw/text/default/phy_class.txt",
"bucket": "content-search",
"filename": "phy_class.txt",
"run_id": "9e96f16a-9689-4c25-a515-04a1040b193f"
}
}
},
"message": "Query successful",
"timestamp": 1774879431
"timestamp": 1774931711
}
```
### File Process
#### File Support Matrix

The system supports the following file formats for all ingestion and upload-ingest operations.

### File Upload
| Category | Supported Extensions | Processing Logic |
| :--- | :--- | :--- |
| **Video** | `.mp4` | Frame extraction, AI-driven summarization, and semantic indexing. |
| **Document** | `.txt`, `.pdf`, `.docx`, `.doc`, `.pptx`, `.ppt`, `.xlsx`, `.xls` | Full-text extraction, semantic chunking, and vector embedding. |
| **Web/Markup** | `.html`, `.htm`, `.xml`, `.md`, `.rst` | Structured text parsing and content indexing. |
| **Image** | `.jpg`, `.png`, `.jpeg` | Visual feature embedding and similarity search indexing. |

> **Technical Note**:
> - **Video**: Default chunking is set to 30 seconds unless the `chunk_duration` parameter is provided.
> - **Text**: Automatic semantic segmentation is applied to ensure high-quality retrieval results.
> - **Max File Size**: Please refer to the `CS_MAX_CONTENT_LENGTH` environment variable (Default: 100MB).

#### File Upload
Used to upload a video file and initiate an asynchronous background task.

* URL: /api/v1/object/upload
Expand All @@ -205,14 +216,14 @@ Response (200 OK):
"code": 20000,
"data": {
"task_id": "c68211de-2187-4f52-b47d-f3a51a52b9ca",
"status": "QUEUED"
"status": "PROCESSING"
},
"message": "File received, processing started.",
"timestamp": 1773909147
}
```

### File ingestion
#### File ingestion
* URL: /api/v1/object/ingest
* Method: POST
* Pattern: ASYNC
Expand Down Expand Up @@ -248,8 +259,47 @@ Response:
"timestamp": 1774878031
}
```
#### Text file ingestion
Used to embed raw text passed in the request body, or to trigger the ingestion pipeline for text-based documents (e.g., .txt, .pdf, .docx) that already exist in MinIO.

* URL: /api/v1/object/ingest-text
* Method: POST
* Pattern: ASYNC
* Parameters:

| Field | Type | Required | Default | Description |
| :--- | :--- | :--- | :--- | :--- |
| `text` | `string` | No | — | **Raw text content** to be segmented, embedded, and stored in the vector database. Provide either `text` or `file_key`. |
| `bucket_name` | `string` | No | `content-search` | MinIO bucket name (used to logically group the data or build the identifier). |
| `file_key` | `string` | No | — | Object key of an existing text-based file in MinIO whose content should be ingested. Provide either `text` or `file_key`. |
| `meta` | `object` | No | `{}` | Extra metadata to store alongside the text (e.g., `course`, `author`, `tags`). |

Request:
```
<!-- example for raw text content -->
curl --location 'http://127.0.0.1:9011/api/v1/object/ingest-text' \
--header 'Content-Type: application/json' \
--data '{
"text": "Newton'\''s Second Law of Motion states that the force acting on an object is equal to the mass of that object multiplied by its acceleration (F = ma). This relationship describes how the velocity of an object changes when it is subjected to an external force.",
"meta": {
"source": "topic-search"
}
}'
```
Response:
```json
{
"code": 20000,
"data": {
"task_id": "f5eb96fd-9c75-4dee-a715-4d39b0762436",
"status": "PROCESSING"
},
"message": "Text ingestion task created successfully",
"timestamp": 1774933932
}
```

### File upload ana ingestion
#### File upload and ingestion
* URL: /api/v1/object/upload-ingest
* Method: POST
* Content-Type: multipart/form-data
Expand Down Expand Up @@ -283,11 +333,11 @@ Response (200 OK):
}
```

### Retrieve and Search
#### Retrieve and Search
* URL: /api/v1/object/search
* Method: POST
* Content-Type: application/json
* Pattern: ASYNC
* Pattern: SYNC
* Parameters:

| Field | Type | Required | Description |
Expand All @@ -298,7 +348,7 @@ Response (200 OK):
| filter | object | No | Metadata filters (e.g., {"run_id": "...", "tags": ["class"]}). |

Request:
```json
```
curl --location 'http://127.0.0.1:9011/api/v1/object/search' \
--header 'Content-Type: application/json' \
--data '{
Expand Down Expand Up @@ -346,7 +396,7 @@ Response (200 OK):
"timestamp": 1774877744
}
```
### Resource Download (Video/Image/Document)
#### Resource Download (Video/Image/Document)
Download existing resources in MinIO.

* URL: /api/v1/object/download/{resource_id}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import uuid
import logging
from fastapi import UploadFile
from typing import Optional

logger = logging.getLogger(__name__)

Expand All @@ -24,10 +25,10 @@ def _try_initialize(self):
self._error_msg = None
except (ImportError, ModuleNotFoundError) as e:
self._error_msg = f"Component missing: {str(e)}"
logger.error(f"MinIO component load failed: {self._error_msg}")
logger.error(f"MinIO component load failed: {self._error_msg}")
except Exception as e:
self._error_msg = f"Initialization failed: {str(e)}"
logger.error(f"MinIO connection failed: {self._error_msg}")
logger.error(f"MinIO connection failed: {self._error_msg}")

@property
def is_available(self) -> bool:
Expand Down Expand Up @@ -63,7 +64,17 @@ async def get_file_stream(self, file_key: str):
response = self._store.client.get_object(self._store.bucket, file_key)
return response
except Exception as e:
logger.error(f"❌ Failed to get file {file_key}: {str(e)}")
logger.error(f"Failed to get file {file_key}: {str(e)}")
raise e

async def get_file_content(self, file_key: str, bucket_name: Optional[str] = None) -> bytes:
    """Read the full contents of an object from MinIO and return them as bytes.

    Args:
        file_key: Object key of the file to read.
        bucket_name: Optional bucket override; falls back to the store's
            default bucket when omitted.

    Raises:
        RuntimeError: If the storage backend failed to initialize.
        Exception: Propagates any error raised by the underlying store
            (logged before re-raising).
    """
    if not self.is_available:
        raise RuntimeError(f"Storage Service is unavailable: {self._error_msg}")
    target_bucket = bucket_name or self._store.bucket
    try:
        # Bug fix: target_bucket was previously computed but never used, so a
        # caller-supplied bucket_name was silently ignored and reads always hit
        # the default bucket. Honor the override by going through the raw
        # client (same access pattern as get_file_stream).
        if target_bucket != self._store.bucket:
            response = self._store.client.get_object(target_bucket, file_key)
            try:
                return response.read()
            finally:
                # minio-py requires releasing the connection after reading.
                response.close()
                response.release_conn()
        return self._store.get_bytes(file_key)
    except Exception as e:
        logger.error(f"Failed to read content for {file_key}: {str(e)}")
        raise e

storage_service = StorageService()
Loading
Loading