Skip to content

Commit aebd5f2

Browse files
update readme and text file ingest (#2368)
1 parent 25f0c99 commit aebd5f2

File tree

6 files changed

+177
-102
lines changed

6 files changed

+177
-102
lines changed

education-ai-suite/smart-classroom/content_search/README.md

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -25,21 +25,18 @@ python .\start_services.py
2525

2626
## API Endpoints
2727

28-
| Endpoint | Method | Pattern | Description | Status |
29-
| :--- | :---: | :---: | :--- | :---: |
30-
| `/api/v1/system/health` | **GET** | SYNC | Backend app health check | DONE |
31-
| `/api/v1/task/query/{task_id}` | **GET** | SYNC | Query status of a specific task | DONE |
32-
| `/api/v1/task/list` | **GET** | SYNC | Query tasks by conditions (e.g., `?status=PROCESSING`) | DONE |
33-
| `/api/v1/object/upload` | **POST** | ASYNC | Upload a file to MinIO | DONE |
34-
| `/api/v1/object/ingest` | **POST** | ASYNC | Ingest a specific file from MinIO | DONE |
35-
| `/api/v1/object/ingest-text` | **POST** | ASYNC | Emedding a raw text | DONE |
36-
| `/api/v1/object/upload-ingest` | **POST** | ASYNC | Upload to MinIO and trigger ingestion | DONE |
37-
| `/api/v1/object/search` | **POST** | ASYNC | Search for files based on description | DONE |
38-
| `/api/v1/object/download` | **POST** | STREAM | Download file from MinIO | DONE |
39-
40-
## API reference
41-
[Content Search API reference](./docs/dev_guide/Content_search_API.md)
42-
28+
| Endpoint | Method | Pattern | Description |
29+
| :--- | :---: | :---: | :--- |
30+
| `/api/v1/task/query/{task_id}` | **GET** | SYNC | **Task Status Inspection**: Retrieves real-time metadata for a specific job, including the current lifecycle state (e.g., PROCESSING, COMPLETED, FAILED) and error logs if applicable. |
31+
| `/api/v1/task/list` | **GET** | SYNC | **Batch Task Retrieval**: Queries task records. Supports filtering via query parameters (e.g., `?status=PROCESSING`) for monitoring system load and pipeline efficiency. |
32+
| `/api/v1/object/ingest-text` | **POST** | ASYNC | **Text-Specific Ingestion**: Primarily processes raw text strings passed in the request body for semantic indexing. It also supports fetching content from existing text-based objects in MinIO. |
33+
| `/api/v1/object/upload-ingest` | **POST** | ASYNC | **Atomic Upload & Ingestion**: A unified workflow that first saves the file to MinIO and then immediately initiates the ingestion pipeline. Features full content indexing and AI-driven Video Summarization for supported video formats. |
34+
| `/api/v1/object/search` | **POST** | SYNC | **Semantic Content Retrieval**: Executes a similarity search across vector collections using either natural language queries or base64-encoded images. Returns ranked results with associated metadata and MinIO object references. |
35+
| `/api/v1/object/download` | **GET** | STREAM | **Original File Download**: Securely fetches the raw source file directly from MinIO storage. Utilizes stream-bridging to pipe binary data to the client. |
36+
37+
For detailed descriptions and examples of each endpoint, please refer to the [Content Search API reference](./docs/dev_guide/Content_search_API.md).
38+
39+
## Components API reference
4340
[Ingest and Retrieve](./docs/dev_guide/file_ingest_and_retrieve/API_GUIDE.md)
4441

4542
[Video Preprocess](./docs/dev_guide/video_preprocess/API_GUIDE.md)

education-ai-suite/smart-classroom/content_search/api/v1/endpoints/object.py

Lines changed: 9 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -76,24 +76,21 @@ async def ingest_existing_file(
7676
)
7777

7878
class IngestTextRequest(BaseModel):
79-
text: str
80-
bucket_name: Optional[str] = None
81-
file_path: Optional[str] = None
79+
text: Optional[str] = None
80+
bucket_name: Optional[str] = "content-search"
81+
file_key: Optional[str] = None
8282
meta: Dict[str, Any] = Field(default_factory=dict)
83+
8384
@router.post("/ingest-text")
8485
async def ingest_raw_text(
8586
request: IngestTextRequest,
8687
background_tasks: BackgroundTasks,
8788
db: Session = Depends(get_db)
8889
):
89-
payload = request.model_dump()
90-
91-
if "tags" not in payload["meta"] or payload["meta"]["tags"] is None:
92-
payload["meta"]["tags"] = ["default"]
9390

9491
result = await task_service.handle_text_ingest(
9592
db,
96-
payload,
93+
request.model_dump(),
9794
background_tasks
9895
)
9996

@@ -102,7 +99,7 @@ async def ingest_raw_text(
10299
"task_id": str(result["task_id"]),
103100
"status": result["status"]
104101
},
105-
message="Text ingestion started"
102+
message="Text ingestion task created successfully"
106103
)
107104

108105
@router.post("/upload-ingest")
@@ -145,44 +142,11 @@ async def upload_file_with_ingest(
145142
message="Upload and Ingest started"
146143
)
147144

148-
# @router.post("/search")
149-
# async def file_search(payload: dict):
150-
# query = payload.get("query")
151-
# limit = payload.get("max_num_results", 3)
152-
# if not query:
153-
# raise HTTPException(status_code=400, detail="Query cannot be empty")
154-
155-
# search_data = await search_service.semantic_search(query, limit)
156-
# return resp_200(data=search_data, message="Resource found")
157-
158145
@router.post("/search")
159-
async def file_search(payload: dict):
160-
query = payload.get("query")
161-
image_base64 = payload.get("image_base64")
162-
filters = payload.get("filter")
163-
limit = payload.get("max_num_results", 10)
164-
165-
if not query and not image_base64:
166-
raise HTTPException(status_code=400, detail="Either 'query' or 'image_base64' must be provided")
167-
168-
if query and image_base64:
169-
raise HTTPException(status_code=400, detail="Provide only one of 'query' or 'image_base64'")
170-
171-
search_payload = {
172-
"max_num_results": limit
173-
}
174-
175-
if query:
176-
search_payload["query"] = query
177-
else:
178-
search_payload["image_base64"] = image_base64
179-
180-
if filters:
181-
search_payload["filter"] = filters
182-
183-
search_data = await search_service.semantic_search(search_payload)
146+
async def file_search(payload: dict, db: Session = Depends(get_db)):
147+
result = await task_service.handle_sync_search(db, payload)
184148

185-
return resp_200(data=search_data, message="Search completed")
149+
return resp_200(data=result, message="Search completed")
186150

187151
@router.get("/download")
188152
async def download_file(file_key: str):

education-ai-suite/smart-classroom/content_search/docs/dev_guide/Content_search_API.md

Lines changed: 83 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ Content-Type: application/json
2222
2323
{
2424
"code": 20000,
25-
"data": { "task_id": "0892f506-4087-4d7e-b890-21303145b4ee" },
25+
"data": { "task_id": "0892f506-4087-4d7e-b890-21303145b4ee", "status": "PROCESSING" },
2626
"message": "Operation Successful",
2727
"timestamp": 167890123
2828
}
@@ -77,8 +77,8 @@ stateDiagram-v2
7777
```
7878

7979
## API Endpoints
80-
81-
### Get Task List
80+
### Task endpoints
81+
#### Get Task List
8282

8383
* URL: /api/v1/task/list
8484

@@ -142,7 +142,7 @@ Response (200 OK)
142142
"timestamp": 1774330753
143143
}
144144
```
145-
### Task Status Polling
145+
#### Task Status Polling
146146
Used to track the progress and retrieve the final result of a submitted task.
147147

148148
* URL: /api/v1/task/query/{task_id}
@@ -153,39 +153,50 @@ Used to track the progress and retrieve the final result of a submitted task.
153153

154154
Request:
155155
```
156-
curl --location 'http://127.0.0.1:9011/api/v1/task/query/56cc417c-9524-41a9-a500-9f0c44a05eac'
156+
curl --location 'http://127.0.0.1:9011/api/v1/task/query/6b9a6a55-d327-42fe-b05e-e0f3098fe797'
157157
```
158158

159159
Response (200 OK):
160160
```json
161161
{
162162
"code": 20000,
163163
"data": {
164-
"task_id": "e557b305-e37c-4074-a04a-ebd067efbd5d",
164+
"task_id": "6b9a6a55-d327-42fe-b05e-e0f3098fe797",
165165
"status": "COMPLETED",
166166
"progress": 100,
167167
"result": {
168-
"message": "File from MinIO successfully processed. db returns {'visual': {'insert_count': 1}}",
169-
"video_summary": {
170-
"type": "done",
171-
"job_id": "bc6513aa-e118-4945-84a8-02922595044e",
172-
"run_id": "5e405f58-03cf-4e44-9e10-85741283587a",
173-
"asset_id": "classroom_8.mp4",
174-
"total_chunks": 1,
175-
"succeeded_chunks": 1,
176-
"failed_chunks": 0,
177-
"ingest_ok_chunks": 1,
178-
"ingest_failed_chunks": 0,
179-
"elapsed_seconds": 36.89442276954651
168+
"message": "Upload only, no ingest requested",
169+
"file_info": {
170+
"source": "minio",
171+
"file_key": "runs/9e96f16a-9689-4c25-a515-04a1040b193f/raw/text/default/phy_class.txt",
172+
"bucket": "content-search",
173+
"filename": "phy_class.txt",
174+
"run_id": "9e96f16a-9689-4c25-a515-04a1040b193f"
180175
}
181176
}
182177
},
183178
"message": "Query successful",
184-
"timestamp": 1774879431
179+
"timestamp": 1774931711
185180
}
186181
```
182+
### File Process
183+
#### File Support Matrix
187184

188-
### File Upload
185+
The system supports the following file formats for all ingestion and upload-ingest operations.
186+
187+
| Category | Supported Extensions | Processing Logic |
188+
| :--- | :--- | :--- |
189+
| **Video** | `.mp4` | Frame extraction, AI-driven summarization, and semantic indexing. |
190+
| **Document** | `.txt`, `.pdf`, `.docx`, `.doc`, `.pptx`, `.ppt`, `.xlsx`, `.xls` | Full-text extraction, semantic chunking, and vector embedding. |
191+
| **Web/Markup** | `.html`, `.htm`, `.xml`, `.md`, `.rst` | Structured text parsing and content indexing. |
192+
| **Image** | `.jpg`, `.png`, `.jpeg` | Visual feature embedding and similarity search indexing. |
193+
194+
> **Technical Note**:
195+
> - **Video**: Default chunking is set to 30 seconds unless the `chunk_duration` parameter is provided.
196+
> - **Text**: Automatic semantic segmentation is applied to ensure high-quality retrieval results.
197+
> - **Max File Size**: Please refer to the `CS_MAX_CONTENT_LENGTH` environment variable (Default: 100MB).
198+
199+
#### File Upload
189200
Used to upload a video file and initiate an asynchronous background task.
190201

191202
* URL: /api/v1/object/upload
@@ -205,14 +216,14 @@ Response (200 OK):
205216
"code": 20000,
206217
"data": {
207218
"task_id": "c68211de-2187-4f52-b47d-f3a51a52b9ca",
208-
"status": "QUEUED"
219+
"status": "PROCESSING"
209220
},
210221
"message": "File received, processing started.",
211222
"timestamp": 1773909147
212223
}
213224
```
214225

215-
### File ingestion
226+
#### File ingestion
216227
* URL: /api/v1/object/ingest
217228
* Method: POST
218229
* Pattern: ASYNC
@@ -248,8 +259,49 @@ Response:
248259
"timestamp": 1774878031
249260
}
250261
```
262+
#### Text file ingestion
263+
Primarily processes raw text strings passed in the request body for semantic indexing. It also supports fetching content from existing text-based objects in MinIO.
264+
265+
* URL: /api/v1/object/ingest-text
266+
* Method: POST
267+
* Pattern: ASYNC
268+
* Parameters:
269+
270+
| Field | Type | Required | Default | Description |
271+
| :--- | :--- | :--- | :--- | :--- |
272+
| `text` | `string` | No | - | **Raw text content** to be segmented, embedded, and stored in the vector database. Optional at the schema level; omit it only when `file_key` references an existing text object in MinIO. |
273+
| `bucket_name` | `string` | No | `content-search` | MinIO bucket name (used to logically group the data or build the identifier). |
274+
| `file_key` | `string` | No | - | Object key in MinIO (logical path or filename), used as a unique identifier for the text source. |
275+
| `meta` | `object` | No | `{}` | Extra metadata to store alongside the text (e.g., `course`, `author`, `tags`). |
276+
277+
Request:
278+
```
279+
# example for raw text content
280+
curl --location 'http://127.0.0.1:9011/api/v1/object/ingest-text' \
281+
--header 'Content-Type: application/json' \
282+
--data '{
283+
"text": "Newton'\''s Second Law of Motion states that the force acting on an object is equal to the mass of that object multiplied by its acceleration (F = ma). This relationship describes how the velocity of an object changes when it is subjected to an external force.",
284+
"meta": {
285+
"source": "topic-search"
286+
}
287+
}'
288+
```
289+
Response:
290+
```json
291+
{
292+
"code": 20000,
293+
"data": {
294+
"task_id": "df3caeb3-3287-4e41-a1f0-098c90d08e03",
295+
"status": "PROCESSING"
296+
},
297+
"message": "Text ingestion task created successfully",
298+
"timestamp": 1775006765
299+
}
300+
```
301+
302+
#### File upload and ingestion
303+
A unified workflow that first saves the file to MinIO and then immediately initiates the ingestion pipeline. Features full content indexing and AI-driven Video Summarization for supported video formats.
251304

252-
### File upload ana ingestion
253305
* URL: /api/v1/object/upload-ingest
254306
* Method: POST
255307
* Content-Type: multipart/form-data
@@ -263,6 +315,7 @@ Response:
263315
| chunk_duration | integer | No | Segment duration in seconds (passed as a Form field). |
264316
| meta | string | No | JSON string of metadata (e.g., '{"course": "CS101"}'). |
265317

318+
* Example:
266319
Request:
267320
```
268321
curl --location 'http://127.0.0.1:9011/api/v1/object/upload-ingest' \
@@ -283,11 +336,13 @@ Response (200 OK):
283336
}
284337
```
285338

286-
### Retrieve and Search
339+
#### Retrieve and Search
340+
Executes a similarity search across vector collections using either natural language queries or base64-encoded images. Returns ranked results with associated metadata and MinIO object references.
341+
287342
* URL: /api/v1/object/search
288343
* Method: POST
289344
* Content-Type: application/json
290-
* Pattern: ASYNC
345+
* Pattern: SYNC
291346
* Parameters:
292347

293348
| Field | Type | Required | Description |
@@ -297,8 +352,9 @@ Response (200 OK):
297352
| max_num_results | integer | No | Maximum number of results to return. Defaults to 10. |
298353
| filter | object | No | Metadata filters (e.g., {"run_id": "...", "tags": ["class"]}). |
299354

355+
* Example:
300356
Request:
301-
```json
357+
```
302358
curl --location 'http://127.0.0.1:9011/api/v1/object/search' \
303359
--header 'Content-Type: application/json' \
304360
--data '{
@@ -346,7 +402,7 @@ Response (200 OK):
346402
"timestamp": 1774877744
347403
}
348404
```
349-
### Resource Download (Video/Image/Document)
405+
#### Resource Download (Video/Image/Document)
350406
Download an existing resource from MinIO.
351407

352408
* URL: /api/v1/object/download/{resource_id}

education-ai-suite/smart-classroom/content_search/utils/storage_service.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import uuid
77
import logging
88
from fastapi import UploadFile
9+
from typing import Optional
910

1011
logger = logging.getLogger(__name__)
1112

@@ -24,10 +25,10 @@ def _try_initialize(self):
2425
self._error_msg = None
2526
except (ImportError, ModuleNotFoundError) as e:
2627
self._error_msg = f"Component missing: {str(e)}"
27-
logger.error(f"MinIO component load failed: {self._error_msg}")
28+
logger.error(f"MinIO component load failed: {self._error_msg}")
2829
except Exception as e:
2930
self._error_msg = f"Initialization failed: {str(e)}"
30-
logger.error(f"MinIO connection failed: {self._error_msg}")
31+
logger.error(f"MinIO connection failed: {self._error_msg}")
3132

3233
@property
3334
def is_available(self) -> bool:
@@ -63,7 +64,17 @@ async def get_file_stream(self, file_key: str):
6364
response = self._store.client.get_object(self._store.bucket, file_key)
6465
return response
6566
except Exception as e:
66-
logger.error(f"❌ Failed to get file {file_key}: {str(e)}")
67+
logger.error(f"Failed to get file {file_key}: {str(e)}")
68+
raise e
69+
70+
async def get_file_content(self, file_key: str, bucket_name: Optional[str] = None) -> bytes:
71+
if not self.is_available:
72+
raise RuntimeError(f"Storage Service is unavailable: {self._error_msg}")
73+
target_bucket = bucket_name or self._store.bucket
74+
try:
75+
return self._store.get_bytes(file_key)
76+
except Exception as e:
77+
logger.error(f"Failed to read content for {file_key}: {str(e)}")
6778
raise e
6879

6980
storage_service = StorageService()

0 commit comments

Comments
 (0)