Skip to content

Commit 662be27

Browse files
leoguillaumeKundun78leoguillaume
authored
Marker parsing v2 (#291)
* feat: ajout page endpoint parser * wip * wip 1 * change PyMyPDF parsing * fix: chunking * fix: wip * wip * wip * change collection for document * fix: tests and ocr signature * WIP: sauvegarde avant pull * Debug * add parsing test * remove key * Update coverage badge --------- Co-authored-by: Kundun78 <coralexandre055@gmail.com> Co-authored-by: leoguillaume <leo.guillaume@modernisation.gouv.fr> Co-authored-by: leoguillaume <leoguillaume@users.noreply.github.com>
1 parent a6f729c commit 662be27

49 files changed

Lines changed: 995 additions & 535 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/badges/coverage.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"schemaVersion":1,"label":"coverage","message":"88.36%","color":"green"}
1+
{"schemaVersion":1,"label":"coverage","message":"87.38%","color":"green"}

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,4 +208,4 @@ uv.lock
208208
app/tests/cassettes
209209

210210
.envrc
211-
.direnv/
211+
.direnv/

app/clients/parser/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from ._baseparserclient import BaseParserClient
2+
from ._markerparserclient import MarkerParserClient
3+
4+
__all__ = ["BaseParserClient", "MarkerParserClient"]
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from abc import ABC, abstractmethod
2+
import importlib
3+
from typing import Optional, Type
4+
5+
from fastapi import UploadFile
6+
7+
from app.schemas.core.settings import ParserType
8+
from app.schemas.parse import Languages, ParsedDocument, ParsedDocumentOutputFormat
9+
10+
11+
class BaseParserClient(ABC):
12+
SUPPORTED_FORMATS = []
13+
14+
@staticmethod
15+
def import_module(type: ParserType) -> "Type[BaseParserClient]":
16+
"""
17+
Import the module for the given parser type.
18+
"""
19+
module = importlib.import_module(f"app.clients.parser._{type.value}parserclient")
20+
return getattr(module, f"{type.capitalize()}ParserClient")
21+
22+
@abstractmethod
23+
def parse(
24+
self,
25+
file: UploadFile,
26+
output_format: Optional[ParsedDocumentOutputFormat] = None,
27+
force_ocr: bool = False,
28+
languages: Optional[Languages] = None,
29+
page_range: Optional[str] = None,
30+
paginate_output: Optional[bool] = None,
31+
use_llm: Optional[bool] = None,
32+
) -> ParsedDocument:
33+
pass
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
from io import BytesIO
2+
import json
3+
from typing import List, Optional
4+
5+
from fastapi import HTTPException
6+
import httpx
7+
import pymupdf
8+
9+
from app.schemas.core.documents import FileType, ParserParams
10+
from app.schemas.parse import ParsedDocument, ParsedDocumentMetadata, ParsedDocumentPage
11+
12+
from ._baseparserclient import BaseParserClient
13+
14+
15+
class MarkerParserClient(BaseParserClient):
16+
"""
17+
Class to interact with the Marker PDF API for document analysis.
18+
"""
19+
20+
SUPPORTED_FORMATS = [FileType.PDF]
21+
22+
def __init__(self, api_url: str, api_key: Optional[str] = None, timeout=120, *args, **kwargs) -> None:
23+
self.api_url = api_url
24+
self.api_key = api_key
25+
self.timeout = timeout
26+
self.headers = {"Authorization": f"Bearer {self.api_key}"} if self.api_key else {}
27+
28+
# Keep health check synchronous in __init__
29+
response = httpx.get(f"{self.api_url}/health", headers=self.headers, timeout=self.timeout)
30+
assert response.status_code == 200, "Marker API is not reachable."
31+
32+
def convert_page_range(self, page_range: str, page_count: int) -> List[int]:
33+
if page_range == "":
34+
return [i for i in range(page_count)]
35+
36+
page_ranges = page_range.split(",")
37+
pages = []
38+
for page_range in page_ranges:
39+
page_range = page_range.split("-")
40+
if len(page_range) == 1:
41+
pages.append(int(page_range[0]))
42+
else:
43+
for i in range(int(page_range[0]), int(page_range[1]) + 1):
44+
pages.append(i)
45+
46+
pages = list(set(pages))
47+
48+
return pages
49+
50+
async def parse(self, **params: ParserParams) -> ParsedDocument:
51+
params = ParserParams(**params)
52+
file_content = await params.file.read()
53+
54+
try:
55+
# Correct way to open PDF from bytes with PyMuPDF
56+
pdf = pymupdf.open(stream=file_content, filetype="pdf")
57+
page_count = pdf.page_count
58+
except Exception as e:
59+
# Handle corrupted or invalid PDF files
60+
raise HTTPException(status_code=400, detail=f"Invalid PDF file: {str(e)}")
61+
62+
data = []
63+
payload = {
64+
"output_format": params.output_format.value,
65+
"force_ocr": params.force_ocr,
66+
"languages": params.languages.value,
67+
"paginate_output": params.paginate_output,
68+
"use_llm": params.use_llm,
69+
}
70+
pages = self.convert_page_range(page_range=params.page_range, page_count=page_count)
71+
async with httpx.AsyncClient() as client:
72+
for i in pages:
73+
# Create a fresh BytesIO object for each request to avoid stream consumption issues
74+
files = {"file": (params.file.filename, BytesIO(file_content), "application/pdf")}
75+
payload["page_range"] = str(i)
76+
77+
response = await client.post(
78+
url=f"{self.api_url}/marker/upload",
79+
files=files,
80+
data=payload,
81+
headers=self.headers,
82+
timeout=self.timeout,
83+
)
84+
if response.status_code != 200:
85+
raise HTTPException(status_code=response.status_code, detail=json.loads(response.text).get("detail", "Parsing failed."))
86+
87+
result = response.json()
88+
if not result.get("success", False):
89+
raise HTTPException(status_code=500, detail=result.get("error", "Parsing failed."))
90+
91+
metadata = ParsedDocumentMetadata(document_name=params.file.filename, page=i, **result["metadata"])
92+
data.append(ParsedDocumentPage(content=result["output"], images=result["images"], metadata=metadata))
93+
94+
# Close the PDF document to free memory
95+
pdf.close()
96+
document = ParsedDocument(data=data)
97+
98+
return document

app/clients/web_search/_bravewebsearchclient.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import logging
2-
import traceback
32
from typing import List
43

54
import httpx
@@ -25,8 +24,7 @@ async def search(self, query: str, n: int = 3) -> List[str]:
2524
response = await client.get(url=self.URL, headers=self.headers, params=params)
2625
results = response.json().get("web", {}).get("results", [])
2726
except Exception:
28-
logger.error(msg="Brave Search API unreachable.")
29-
logger.debug(msg=traceback.format_exc())
27+
logger.exception(msg="Brave Search API unreachable.")
3028
results = []
3129

3230
return [result["url"].lower() for result in results]

app/clients/web_search/_duckduckgowebsearchclient.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import logging
2-
import traceback
32
from typing import List
43

54
import httpx
@@ -29,8 +28,7 @@ async def search(self, query: str, n: int = 3) -> List[str]:
2928
response = await client.get(url=self.URL, headers=self.headers, params=params, follow_redirects=True)
3029
results = response.json().get("Results", [])[:n]
3130
except Exception:
32-
logger.error(msg="DuckDuckGo API unreachable.")
33-
logger.debug(msg=traceback.format_exc())
31+
logger.exception(msg="DuckDuckGo API unreachable.")
3432
results = []
3533

3634
return [result["FirstURL"].lower() for result in results]

app/endpoints/documents.py

Lines changed: 99 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,114 @@
1-
from typing import Optional, Union
1+
from typing import List, Literal, Optional, Union
22
from uuid import UUID
33

4-
from fastapi import APIRouter, Depends, Path, Query, Request, Response, Security
4+
from fastapi import APIRouter, Depends, Path, Query, Request, Response, Security, UploadFile
55
from fastapi.responses import JSONResponse
66
from sqlalchemy.ext.asyncio import AsyncSession
77

88
from app.helpers import AccessController
9-
from app.schemas.documents import Document, Documents
9+
from app.schemas.documents import (
10+
ChunkerName,
11+
ChunkerNameForm,
12+
ChunkMinSizeForm,
13+
ChunkOverlapForm,
14+
ChunkSizeForm,
15+
CollectionForm,
16+
Document,
17+
DocumentResponse,
18+
Documents,
19+
IsSeparatorRegexForm,
20+
LengthFunctionForm,
21+
MetadataForm,
22+
SeparatorsForm,
23+
)
24+
from app.schemas.parse import (
25+
FileForm,
26+
ForceOCRForm,
27+
Languages,
28+
LanguagesForm,
29+
OutputFormatForm,
30+
PageRangeForm,
31+
PaginateOutputForm,
32+
ParsedDocumentOutputFormat,
33+
UseLLMForm,
34+
)
1035
from app.sql.session import get_db as get_session
1136
from app.utils.context import global_context, request_context
12-
from app.utils.exceptions import CollectionNotFoundException, DocumentNotFoundException
37+
from app.utils.exceptions import CollectionNotFoundException, DocumentNotFoundException, FileSizeLimitExceededException
1338
from app.utils.variables import ENDPOINT__DOCUMENTS
1439

1540
router = APIRouter()
1641

1742

18-
@router.get(path=ENDPOINT__DOCUMENTS + "/{document:path}", dependencies=[Security(dependency=AccessController())], status_code=200, response_model=Document) # fmt: off
43+
@router.post(path=ENDPOINT__DOCUMENTS, status_code=201, dependencies=[Security(dependency=AccessController())], response_model=DocumentResponse)
44+
async def create_document(
45+
request: Request,
46+
session: AsyncSession = Depends(get_session),
47+
file: UploadFile = FileForm,
48+
collection: int = CollectionForm,
49+
paginate_output: Optional[bool] = PaginateOutputForm,
50+
page_range: str = PageRangeForm,
51+
languages: Optional[Languages] = LanguagesForm,
52+
force_ocr: bool = ForceOCRForm,
53+
output_format: ParsedDocumentOutputFormat = OutputFormatForm,
54+
use_llm: Optional[bool] = UseLLMForm,
55+
chunker_name: ChunkerName = ChunkerNameForm,
56+
chunk_size: int = ChunkSizeForm,
57+
chunk_overlap: int = ChunkOverlapForm,
58+
length_function: Literal["len"] = LengthFunctionForm,
59+
is_separator_regex: bool = IsSeparatorRegexForm,
60+
separators: List[str] = SeparatorsForm,
61+
chunk_min_size: int = ChunkMinSizeForm,
62+
metadata: str = MetadataForm,
63+
) -> JSONResponse:
64+
"""
65+
Parse a file and create a document.
66+
"""
67+
if not global_context.documents: # no vector store available
68+
raise CollectionNotFoundException()
69+
70+
file_size = len(file.file.read())
71+
if file_size > FileSizeLimitExceededException.MAX_CONTENT_SIZE:
72+
raise FileSizeLimitExceededException()
73+
file.file.seek(0) # reset file pointer to the beginning of the file
74+
75+
length_function = len if length_function == "len" else length_function
76+
77+
document = await global_context.parser.parse_file(
78+
file=file,
79+
collection=collection,
80+
paginate_output=paginate_output,
81+
page_range=page_range,
82+
languages=languages,
83+
force_ocr=force_ocr,
84+
output_format=output_format,
85+
use_llm=use_llm,
86+
)
87+
88+
document_id = await global_context.documents.create_document(
89+
user_id=request_context.get().user_id,
90+
session=session,
91+
collection_id=collection,
92+
document=document,
93+
chunker_name=chunker_name,
94+
chunk_size=chunk_size,
95+
chunk_overlap=chunk_overlap,
96+
length_function=length_function,
97+
is_separator_regex=is_separator_regex,
98+
separators=separators,
99+
chunk_min_size=chunk_min_size,
100+
metadata=metadata,
101+
)
102+
103+
return JSONResponse(content=DocumentResponse(id=document_id).model_dump(), status_code=201)
104+
105+
106+
@router.get(
107+
path=ENDPOINT__DOCUMENTS + "/{document:path}",
108+
dependencies=[Security(dependency=AccessController())],
109+
status_code=200,
110+
response_model=Document,
111+
)
19112
async def get_document(
20113
request: Request,
21114
document: int = Path(description="The document ID"),
@@ -68,7 +161,7 @@ async def delete_document(
68161
session: AsyncSession = Depends(get_session),
69162
) -> Response:
70163
"""
71-
Delete a document and relative collections.
164+
Delete a document.
72165
"""
73166
if not global_context.documents: # no vector store available
74167
raise DocumentNotFoundException()

0 commit comments

Comments
 (0)