@@ -51,19 +51,34 @@ async def process_document(
5151
5252 filename = unquote (x_filename ) if x_filename else "document"
5353 mime_type = content_type or "application/octet-stream"
54+ file_ext = filename .rsplit ("." , 1 )[- 1 ].lower () if "." in filename else ""
5455
5556 log .info ("Processing '%s' (%s, %d bytes)" , filename , mime_type , len (file_bytes ))
5657
57- # ── Call Docling (embedded images) ────────────────────────────────────────
58- try :
59- raw_markdown = await fetch_markdown_with_images (
60- file_bytes = file_bytes ,
61- filename = filename ,
62- mime_type = mime_type ,
63- )
64- except Exception as exc :
65- log .error ("Docling error: %s" , exc )
66- raise HTTPException (status_code = 502 , detail = f"Docling error: { exc } " )
58+ # ── Short-circuit for plain text / markdown ───────────────────────────────
59+ # Skip Docling (no conversion needed) but still run image processing and
60+ # text cleanup — the file may already contain embedded base64 images.
61+ is_text = (
62+ file_ext in ("txt" , "md" )
63+ or (mime_type .startswith ("text/" ) and "html" not in mime_type )
64+ )
65+ if is_text :
66+ log .info ("Text file detected — skipping Docling, processing content directly." )
67+ try :
68+ raw_markdown = file_bytes .decode ("utf-8" , errors = "replace" )
69+ except Exception :
70+ raw_markdown = file_bytes .decode ("latin-1" , errors = "replace" )
71+ else :
72+ # ── Call Docling (embedded images) ────────────────────────────────────
73+ try :
74+ raw_markdown = await fetch_markdown_with_images (
75+ file_bytes = file_bytes ,
76+ filename = filename ,
77+ mime_type = mime_type ,
78+ )
79+ except Exception as exc :
80+ log .error ("Docling error: %s" , exc )
81+ raise HTTPException (status_code = 502 , detail = f"Docling error: { exc } " )
6782
6883 # ── Extract base64 images → Azure Blob URLs ───────────────────────────────
6984 try :
0 commit comments