|
1 | | -from typing import Optional, Union |
| 1 | +from typing import List, Literal, Optional, Union |
2 | 2 | from uuid import UUID |
3 | 3 |
|
4 | | -from fastapi import APIRouter, Depends, Path, Query, Request, Response, Security |
| 4 | +from fastapi import APIRouter, Depends, Path, Query, Request, Response, Security, UploadFile |
5 | 5 | from fastapi.responses import JSONResponse |
6 | 6 | from sqlalchemy.ext.asyncio import AsyncSession |
7 | 7 |
|
8 | 8 | from app.helpers import AccessController |
9 | | -from app.schemas.documents import Document, Documents |
| 9 | +from app.schemas.documents import ( |
| 10 | + ChunkerName, |
| 11 | + ChunkerNameForm, |
| 12 | + ChunkMinSizeForm, |
| 13 | + ChunkOverlapForm, |
| 14 | + ChunkSizeForm, |
| 15 | + CollectionForm, |
| 16 | + Document, |
| 17 | + DocumentResponse, |
| 18 | + Documents, |
| 19 | + IsSeparatorRegexForm, |
| 20 | + LengthFunctionForm, |
| 21 | + MetadataForm, |
| 22 | + SeparatorsForm, |
| 23 | +) |
| 24 | +from app.schemas.parse import ( |
| 25 | + FileForm, |
| 26 | + ForceOCRForm, |
| 27 | + Languages, |
| 28 | + LanguagesForm, |
| 29 | + OutputFormatForm, |
| 30 | + PageRangeForm, |
| 31 | + PaginateOutputForm, |
| 32 | + ParsedDocumentOutputFormat, |
| 33 | + UseLLMForm, |
| 34 | +) |
10 | 35 | from app.sql.session import get_db as get_session |
11 | 36 | from app.utils.context import global_context, request_context |
12 | | -from app.utils.exceptions import CollectionNotFoundException, DocumentNotFoundException |
| 37 | +from app.utils.exceptions import CollectionNotFoundException, DocumentNotFoundException, FileSizeLimitExceededException |
13 | 38 | from app.utils.variables import ENDPOINT__DOCUMENTS |
14 | 39 |
|
15 | 40 | router = APIRouter() |
16 | 41 |
|
17 | 42 |
|
18 | | -@router.get(path=ENDPOINT__DOCUMENTS + "/{document:path}", dependencies=[Security(dependency=AccessController())], status_code=200, response_model=Document) # fmt: off |
@router.post(
    path=ENDPOINT__DOCUMENTS,
    status_code=201,
    dependencies=[Security(dependency=AccessController())],
    response_model=DocumentResponse,
)
async def create_document(
    request: Request,
    session: AsyncSession = Depends(get_session),
    file: UploadFile = FileForm,
    collection: int = CollectionForm,
    paginate_output: Optional[bool] = PaginateOutputForm,
    page_range: str = PageRangeForm,
    languages: Optional[Languages] = LanguagesForm,
    force_ocr: bool = ForceOCRForm,
    output_format: ParsedDocumentOutputFormat = OutputFormatForm,
    use_llm: Optional[bool] = UseLLMForm,
    chunker_name: ChunkerName = ChunkerNameForm,
    chunk_size: int = ChunkSizeForm,
    chunk_overlap: int = ChunkOverlapForm,
    length_function: Literal["len"] = LengthFunctionForm,
    is_separator_regex: bool = IsSeparatorRegexForm,
    separators: List[str] = SeparatorsForm,
    chunk_min_size: int = ChunkMinSizeForm,
    metadata: str = MetadataForm,
) -> JSONResponse:
    """
    Parse a file and create a document.

    The uploaded file is size-checked, handed to ``global_context.parser.parse_file``
    together with the parsing options, and the parsed result is then passed to
    ``global_context.documents.create_document`` together with the chunking options.

    Args:
        request: The incoming request (not read directly by this handler).
        session: Database session forwarded to the document store.
        file: The uploaded file to parse.
        collection: Collection identifier; forwarded to the parser as ``collection``
            and to the document store as ``collection_id``.
        paginate_output, page_range, languages, force_ocr, output_format, use_llm:
            Parsing options forwarded unchanged to ``parse_file``.
        chunker_name, chunk_size, chunk_overlap, is_separator_regex, separators,
        chunk_min_size, metadata:
            Options forwarded unchanged to ``documents.create_document``.
        length_function: Name of the length function; ``"len"`` is mapped to the
            builtin ``len`` before being forwarded.

    Returns:
        A 201 ``JSONResponse`` whose body is a ``DocumentResponse`` carrying the new document ID.

    Raises:
        CollectionNotFoundException: If ``global_context.documents`` is not set.
        FileSizeLimitExceededException: If the upload is larger than
            ``FileSizeLimitExceededException.MAX_CONTENT_SIZE``.
    """
    import os  # local import: only needed for SEEK_END below

    if not global_context.documents:  # no vector store available
        raise CollectionNotFoundException()

    # Measure the upload by seeking to its end instead of reading it, so an
    # oversized file is never loaded fully into memory just to be rejected.
    file.file.seek(0, os.SEEK_END)
    file_size = file.file.tell()
    file.file.seek(0)  # reset file pointer to the beginning of the file
    if file_size > FileSizeLimitExceededException.MAX_CONTENT_SIZE:
        raise FileSizeLimitExceededException()

    # The form only carries the function's name; swap in the actual callable.
    length_function = len if length_function == "len" else length_function

    document = await global_context.parser.parse_file(
        file=file,
        collection=collection,
        paginate_output=paginate_output,
        page_range=page_range,
        languages=languages,
        force_ocr=force_ocr,
        output_format=output_format,
        use_llm=use_llm,
    )

    document_id = await global_context.documents.create_document(
        user_id=request_context.get().user_id,
        session=session,
        collection_id=collection,
        document=document,
        chunker_name=chunker_name,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=length_function,
        is_separator_regex=is_separator_regex,
        separators=separators,
        chunk_min_size=chunk_min_size,
        metadata=metadata,
    )

    return JSONResponse(content=DocumentResponse(id=document_id).model_dump(), status_code=201)
| 104 | + |
| 105 | + |
| 106 | +@router.get( |
| 107 | + path=ENDPOINT__DOCUMENTS + "/{document:path}", |
| 108 | + dependencies=[Security(dependency=AccessController())], |
| 109 | + status_code=200, |
| 110 | + response_model=Document, |
| 111 | +) |
19 | 112 | async def get_document( |
20 | 113 | request: Request, |
21 | 114 | document: int = Path(description="The document ID"), |
@@ -68,7 +161,7 @@ async def delete_document( |
68 | 161 | session: AsyncSession = Depends(get_session), |
69 | 162 | ) -> Response: |
70 | 163 | """ |
71 | | - Delete a document and relative collections. |
| 164 | + Delete a document. |
72 | 165 | """ |
73 | 166 | if not global_context.documents: # no vector store available |
74 | 167 | raise DocumentNotFoundException() |
|
0 commit comments