Skip to content

Commit 0c8bcff

Browse files
committed
feat: handle text files without docling
Enhance document processing to handle plain text and markdown files directly, bypassing Docling for efficiency.
1 parent d714cc9 commit 0c8bcff

File tree

1 file changed

+25
-10
lines changed

1 file changed

+25
-10
lines changed

app/main.py

Lines changed: 25 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -51,19 +51,34 @@ async def process_document(
5151

5252
filename = unquote(x_filename) if x_filename else "document"
5353
mime_type = content_type or "application/octet-stream"
54+
file_ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
5455

5556
log.info("Processing '%s' (%s, %d bytes)", filename, mime_type, len(file_bytes))
5657

57-
# ── Call Docling (embedded images) ────────────────────────────────────────
58-
try:
59-
raw_markdown = await fetch_markdown_with_images(
60-
file_bytes=file_bytes,
61-
filename=filename,
62-
mime_type=mime_type,
63-
)
64-
except Exception as exc:
65-
log.error("Docling error: %s", exc)
66-
raise HTTPException(status_code=502, detail=f"Docling error: {exc}")
58+
# ── Short-circuit for plain text / markdown ───────────────────────────────
59+
# Skip Docling (no conversion needed) but still run image processing and
60+
# text cleanup — the file may already contain embedded base64 images.
61+
is_text = (
62+
file_ext in ("txt", "md")
63+
or (mime_type.startswith("text/") and "html" not in mime_type)
64+
)
65+
if is_text:
66+
log.info("Text file detected — skipping Docling, processing content directly.")
67+
try:
68+
raw_markdown = file_bytes.decode("utf-8", errors="replace")
69+
except Exception:
70+
raw_markdown = file_bytes.decode("latin-1", errors="replace")
71+
else:
72+
# ── Call Docling (embedded images) ────────────────────────────────────
73+
try:
74+
raw_markdown = await fetch_markdown_with_images(
75+
file_bytes=file_bytes,
76+
filename=filename,
77+
mime_type=mime_type,
78+
)
79+
except Exception as exc:
80+
log.error("Docling error: %s", exc)
81+
raise HTTPException(status_code=502, detail=f"Docling error: {exc}")
6782

6883
# ── Extract base64 images → Azure Blob URLs ───────────────────────────────
6984
try:

0 commit comments

Comments
 (0)