Skip to content

Commit 8ec6af3

Browse files
authored
fix(documents): limit document chunk uploads to a maximum of 20MB per document (#902)
* fix(documents): limit document chunk uploads to a maximum of 20MB per document
* Update unit coverage badge

---------

Co-authored-by: leoguillaume <leoguillaume@users.noreply.github.com>
1 parent d41cd2b commit 8ec6af3

3 files changed

Lines changed: 22 additions & 14 deletions

File tree

.github/badges/coverage.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"schemaVersion":1,"label":"coverage","message":"56.15%","color":"red"}
1+
{"schemaVersion":1,"label":"coverage","message":"56.11%","color":"red"}

api/helpers/_documentmanager.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
ChunkingFailedException,
3131
CollectionNotFoundException,
3232
DocumentNotFoundException,
33+
FileSizeLimitExceededException,
3334
InsufficientStorageLimitException,
3435
ParsingDocumentFailedException,
3536
VectorizationFailedException,
@@ -201,7 +202,7 @@ async def create_document(
201202
# parse the file
202203
try:
203204
content = await self.parser_manager.parse(file=file)
204-
document_size = len(content.encode(encoding="utf-8"))
205+
document_size = len(content)
205206
except Exception as e:
206207
logger.exception(f"failed to parse {document_name} ({e}).")
207208
raise ParsingDocumentFailedException()
@@ -399,13 +400,26 @@ async def create_document_chunks(
399400
for i, chunk in enumerate(chunks, start=start)
400401
]
401402

402-
chunks_size = sum(len(chunk.content.encode(encoding="utf-8")) for chunk in chunks)
403+
chunks_size = sum(len(chunk.content) for chunk in chunks)
403404
storage_limit, storage_consumption = await self._get_storage_limit_and_consumption(postgres_session=postgres_session, user_id=user_id)
404405
if storage_limit is not None and storage_consumption > storage_limit:
405406
raise InsufficientStorageLimitException(
406407
detail=f"Upload size limit exceeded. Limit: {storage_limit} bytes. Current: {storage_consumption} bytes."
407408
)
408409

410+
# update the document size
411+
result = await postgres_session.execute(
412+
statement=update(table=DocumentTable)
413+
.values(size=func.coalesce(DocumentTable.size, 0) + chunks_size)
414+
.where(DocumentTable.id == document_id)
415+
.returning(DocumentTable.size)
416+
)
417+
new_size = result.scalar_one()
418+
419+
if new_size > FileSizeLimitExceededException.MAX_CONTENT_SIZE:
420+
await postgres_session.rollback()
421+
raise FileSizeLimitExceededException()
422+
409423
try:
410424
await self._upsert_document_chunks(
411425
chunks=chunks,
@@ -417,17 +431,11 @@ async def create_document_chunks(
417431
request_context=request_context,
418432
)
419433
except Exception as e:
434+
await postgres_session.rollback()
420435
raise VectorizationFailedException(detail=f"Vectorization failed: {e}")
421436

422-
chunk_ids = [chunk.id for chunk in chunks]
423-
424-
# update the document size
425-
await postgres_session.execute(
426-
statement=update(table=DocumentTable)
427-
.values(size=func.coalesce(DocumentTable.size, 0) + chunks_size)
428-
.where(DocumentTable.id == document_id)
429-
)
430437
await postgres_session.commit()
438+
chunk_ids = [chunk.id for chunk in chunks]
431439

432440
return chunk_ids
433441

@@ -478,7 +486,7 @@ async def get_document_chunks(
478486
statement=select(DocumentTable)
479487
.join(CollectionTable, DocumentTable.collection_id == CollectionTable.id)
480488
.where(DocumentTable.id == document_id)
481-
.where(CollectionTable.user_id == user_id)
489+
.where(or_(CollectionTable.user_id == user_id, CollectionTable.visibility == CollectionVisibility.PUBLIC))
482490
)
483491
try:
484492
result.scalar_one()

api/utils/exceptions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -218,14 +218,14 @@ def __init__(self, detail: str = "Cannot change the role of the last admin user.
218218

219219
# 413
220220
class FileSizeLimitExceededException(HTTPException):
221-
MAX_CONTENT_SIZE = 20 * 1024 * 1024 # 20MB
221+
MAX_CONTENT_SIZE = 20_000_000 # 20MB
222222

223223
def __init__(self, detail: str = f"File size limit exceeded (max: {MAX_CONTENT_SIZE} bytes).") -> None:
224224
super().__init__(status_code=413, detail=detail)
225225

226226

227227
class ChunksContentSizeLimitExceededException(HTTPException):
228-
MAX_CONTENT_SIZE = 20 * 1024 * 1024 # 20MB
228+
MAX_CONTENT_SIZE = 20_000_000 # 20MB
229229

230230
def __init__(self, detail: str = f"Total chunks content size limit exceeded (max: {MAX_CONTENT_SIZE} bytes).") -> None:
231231
super().__init__(status_code=413, detail=detail)

0 commit comments

Comments
 (0)