Skip to content

Commit f94e687

Browse files
feat(tutor): use tika to extract from pdf (#80)
1 parent 21d1afb commit f94e687

File tree

4 files changed

+12
-13
lines changed

4 files changed

+12
-13
lines changed

src/app/api/api_v1/endpoints/tutor.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,6 @@
1212
from src.app.services.tutor.agents import TEMPLATES
1313
from src.app.services.tutor.models import (
1414
ExtractorOutputList,
15-
SummariesOutputModel,
1615
SyllabusFeedback,
1716
SyllabusResponse,
1817
SyllabusResponseAgent,
@@ -48,7 +47,7 @@
4847
@router.post("/files/content")
4948
async def extract_files_content(
5049
files: Annotated[list[UploadFile], File()],
51-
) -> SummariesOutputModel | None:
50+
) -> ExtractorOutputList | None:
5251
files_content = await get_files_content(files)
5352
files_content_str = ("__DOCUMENT_SEPARATOR__").join(files_content)
5453

@@ -67,7 +66,7 @@ async def extract_files_content(
6766
summaries = await chatfactory.chat_client.completion(messages=messages)
6867
assert isinstance(summaries, str)
6968
json_summaries = extract_json_from_response(summaries)
70-
summaries_output = SummariesOutputModel(**json_summaries)
69+
summaries_output = ExtractorOutputList(**json_summaries)
7170

7271
return summaries_output
7372

src/app/services/llm_proxy.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,8 @@ async def completion(
5757
response_format: Optional[Union[dict, Type[BaseModel]]] = None,
5858
) -> dict | str:
5959

60+
logger.info("starting completion with model_name=%s", self.model)
61+
6062
if self.is_azure_model:
6163
return await self.az_completion(messages)
6264

src/app/services/tutor/models.py

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -14,10 +14,6 @@ class ExtractorOutputList(BaseModel):
1414
extracts: list[ExtractorOutput]
1515

1616

17-
class SummariesOutputModel(BaseModel):
18-
summaries: list[str]
19-
20-
2117
class TutorSearchResponse(BaseModel):
2218
extracts: list[ExtractorOutput]
2319
nb_results: int

src/app/services/tutor/utils.py

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,11 @@
22

33
from docx import Document as DocxReader
44
from fastapi import HTTPException, UploadFile
5-
6-
# from src.app.services.pdf_extractor import extract_txt_from_pdf_with_tika
75
from pypdf import PdfReader
86
from qdrant_client.models import ScoredPoint
97

108
from src.app.api.dependencies import get_settings
9+
from src.app.services.pdf_extractor import extract_txt_from_pdf_with_tika
1110
from src.app.utils.decorators import log_time_and_error_sync
1211

1312
settings = get_settings()
@@ -97,11 +96,14 @@ async def get_file_content(file: UploadFile) -> str:
9796

9897

9998
async def _extract_pdf_content(file) -> str:
100-
reader = PdfReader(file.file)
101-
return "\n".join(page.extract_text() or "" for page in reader.pages)
102-
# content = extract_txt_from_pdf_with_tika(file.file, settings.TIKA_URL_BASE)
99+
content = ""
100+
try:
101+
content = extract_txt_from_pdf_with_tika(file.file, settings.TIKA_URL_BASE)
102+
except Exception:
103+
reader = PdfReader(file.file)
104+
content = "\n".join(page.extract_text() or "" for page in reader.pages)
103105

104-
# return content
106+
return content
105107

106108

107109
async def _extract_text_content(file) -> str:

0 commit comments

Comments
 (0)